nb_kernel303_x86_64_sse2.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,073 行 · 第 1/5 页

S
2,073
字号
        ## ----------------------------------------------------------------------
        ## nb303 inner-loop fragment (GNU as, AT&T syntax, SSE2 double precision).
        ##
        ## This span starts part-way through the two-j-atom loop body and ends
        ## part-way through the single-j-atom tail; the loop head and the rest of
        ## the tail lie outside this span.
        ##
        ## Live on entry, as far as the code below shows:
        ##   xmm0, xmm3, xmm6     rsq for H1, H2, M
        ##   xmm9, xmm10, xmm11   unfinished first rsqrt iteration for H1, H2, M
        ##   xmm15                factor that finishes that iteration
        ##                        (presumably nb303_half - loaded before this span)
        ##   rax, rbx             element offsets of the lane-0 / lane-1 j atom in
        ##                        faction[] (presumably j3 - set before this span)
        ##   rbp / rsp            bases for the nb303_* argument / local slots
        ## ----------------------------------------------------------------------

        mulpd     %xmm15,%xmm9                  ## first iteration for rinvH1
        mulpd     %xmm15,%xmm10                 ## first iteration for rinvH2
        mulpd     %xmm15,%xmm11                 ## first iteration for rinvM

        ## second iteration step: lu = half * lu * (three - rsq*lu*lu)
        movapd    %xmm9,%xmm2
        movapd    %xmm10,%xmm5
        movapd    %xmm11,%xmm8
        mulpd     %xmm2,%xmm2                   ## lu*lu
        mulpd     %xmm5,%xmm5                   ## lu*lu
        mulpd     %xmm8,%xmm8                   ## lu*lu
        movapd    nb303_three(%rsp),%xmm1
        movapd    %xmm1,%xmm4
        movapd    %xmm1,%xmm7
        mulpd     %xmm0,%xmm2                   ## rsq*lu*lu
        mulpd     %xmm3,%xmm5                   ## rsq*lu*lu
        mulpd     %xmm6,%xmm8                   ## rsq*lu*lu
        subpd     %xmm2,%xmm1
        subpd     %xmm5,%xmm4
        subpd     %xmm8,%xmm7                   ## 3-rsq*lu*lu
        mulpd     %xmm1,%xmm9
        mulpd     %xmm4,%xmm10
        mulpd     %xmm7,%xmm11                  ## lu*(3-rsq*lu*lu)
        movapd    nb303_half(%rsp),%xmm15
        mulpd     %xmm15,%xmm9                  ## rinvH1
        mulpd     %xmm15,%xmm10                 ## rinvH2
        mulpd     %xmm15,%xmm11                 ## rinvM

        ## keep rinv for the force scaling further down
        movapd    %xmm9,nb303_rinvH1(%rsp)
        movapd    %xmm10,nb303_rinvH2(%rsp)
        movapd    %xmm11,nb303_rinvM(%rsp)

        ## interactions
        ## rsq in xmm0,xmm3,xmm6
        ## rinv in xmm9, xmm10, xmm11
        movapd    nb303_tsc(%rsp),%xmm1
        mulpd     %xmm9,%xmm0                   ## r = rsq*rinv
        mulpd     %xmm10,%xmm3
        mulpd     %xmm11,%xmm6
        mulpd     %xmm1,%xmm0                   ## rtab = r*tsc
        mulpd     %xmm1,%xmm3
        mulpd     %xmm1,%xmm6

        ## truncate and convert to integers
        cvttpd2dq %xmm0,%xmm1
        cvttpd2dq %xmm3,%xmm4
        cvttpd2dq %xmm6,%xmm7

        ## convert back to float
        cvtdq2pd  %xmm1,%xmm2
        cvtdq2pd  %xmm4,%xmm5
        cvtdq2pd  %xmm7,%xmm8

        ## multiply by 4 (the loads below read 4 consecutive doubles per index)
        pslld     $2,%xmm1
        pslld     $2,%xmm4
        pslld     $2,%xmm7

        ## move to integer registers
        ## pshufd $1 brings the second dword (lane-1 index) down to the low dword
        pshufd    $1,%xmm1,%xmm13
        pshufd    $1,%xmm4,%xmm14
        pshufd    $1,%xmm7,%xmm15
        movd      %xmm1,%r8d                    ## H1 index, lane 0
        movd      %xmm4,%r10d                   ## H2 index, lane 0
        movd      %xmm7,%r12d                   ## M  index, lane 0
        movd      %xmm13,%r9d                   ## H1 index, lane 1
        movd      %xmm14,%r11d                  ## H2 index, lane 1
        movd      %xmm15,%r13d                  ## M  index, lane 1

        movq      nb303_VFtab(%rbp),%rsi

        ## calculate eps = rtab - trunc(rtab)
        subpd     %xmm2,%xmm0
        subpd     %xmm5,%xmm3
        subpd     %xmm8,%xmm6

        movapd    %xmm0,%xmm12                  ## epsH1
        movapd    %xmm3,%xmm13                  ## epsH2
        movapd    %xmm6,%xmm14                  ## epsM

        ## Load LOTS of table data
        ## low halves: lane-0 indices (r8, r10, r12)
        movlpd    (%rsi,%r8,8),%xmm0
        movlpd    8(%rsi,%r8,8),%xmm1
        movlpd    16(%rsi,%r8,8),%xmm2
        movlpd    24(%rsi,%r8,8),%xmm3
        movlpd    (%rsi,%r10,8),%xmm4
        movlpd    8(%rsi,%r10,8),%xmm5
        movlpd    16(%rsi,%r10,8),%xmm6
        movlpd    24(%rsi,%r10,8),%xmm7
        movlpd    (%rsi,%r12,8),%xmm8
        movlpd    8(%rsi,%r12,8),%xmm9
        movlpd    16(%rsi,%r12,8),%xmm10
        movlpd    24(%rsi,%r12,8),%xmm11
        ## high halves: lane-1 indices (r9, r11, r13)
        movhpd    (%rsi,%r9,8),%xmm0
        movhpd    8(%rsi,%r9,8),%xmm1
        movhpd    16(%rsi,%r9,8),%xmm2
        movhpd    24(%rsi,%r9,8),%xmm3
        movhpd    (%rsi,%r11,8),%xmm4
        movhpd    8(%rsi,%r11,8),%xmm5
        movhpd    16(%rsi,%r11,8),%xmm6
        movhpd    24(%rsi,%r11,8),%xmm7
        movhpd    (%rsi,%r13,8),%xmm8
        movhpd    8(%rsi,%r13,8),%xmm9
        movhpd    16(%rsi,%r13,8),%xmm10
        movhpd    24(%rsi,%r13,8),%xmm11
        ## table data ready in xmm0-xmm3 , xmm4-xmm7 , and xmm8-xmm11

        mulpd     %xmm12,%xmm3                  ## Heps
        mulpd     %xmm13,%xmm7
        mulpd     %xmm14,%xmm11
        mulpd     %xmm12,%xmm2                  ## Geps
        mulpd     %xmm13,%xmm6
        mulpd     %xmm14,%xmm10
        mulpd     %xmm12,%xmm3                  ## Heps2
        mulpd     %xmm13,%xmm7
        mulpd     %xmm14,%xmm11

        addpd     %xmm2,%xmm1                   ## F+Geps
        addpd     %xmm6,%xmm5
        addpd     %xmm10,%xmm9
        addpd     %xmm3,%xmm1                   ## F+Geps+Heps2 = Fp
        addpd     %xmm7,%xmm5
        addpd     %xmm11,%xmm9
        addpd     %xmm3,%xmm3                   ## 2*Heps2
        addpd     %xmm7,%xmm7
        addpd     %xmm11,%xmm11
        addpd     %xmm2,%xmm3                   ## 2*Heps2+Geps
        addpd     %xmm6,%xmm7
        addpd     %xmm10,%xmm11
        addpd     %xmm1,%xmm3                   ## FF = Fp + 2*Heps2 + Geps
        addpd     %xmm5,%xmm7
        addpd     %xmm9,%xmm11
        mulpd     %xmm12,%xmm1                  ## eps*Fp
        mulpd     %xmm13,%xmm5
        mulpd     %xmm14,%xmm9
        movapd    nb303_qqH(%rsp),%xmm12        ## eps no longer needed; reuse for qq
        movapd    nb303_qqM(%rsp),%xmm13
        addpd     %xmm0,%xmm1                   ## VV
        addpd     %xmm4,%xmm5
        addpd     %xmm8,%xmm9
        mulpd     %xmm12,%xmm1                  ## VV*qq = vcoul
        mulpd     %xmm12,%xmm5
        mulpd     %xmm13,%xmm9
        mulpd     %xmm12,%xmm3                  ## FF*qq = fij
        mulpd     %xmm12,%xmm7
        mulpd     %xmm13,%xmm11

        ## accumulate vctot
        addpd     nb303_vctot(%rsp),%xmm1
        addpd     %xmm9,%xmm5
        addpd     %xmm5,%xmm1
        movapd    %xmm1,nb303_vctot(%rsp)

        ## fscal = -(fij*tsc)*rinv for H1 (xmm4), H2 (xmm8), M (xmm11)
        movapd    nb303_tsc(%rsp),%xmm10
        mulpd     %xmm10,%xmm3                  ## fscal
        mulpd     %xmm10,%xmm7
        mulpd     %xmm11,%xmm10                 ## M product lands in xmm10

        xorpd     %xmm4,%xmm4
        xorpd     %xmm8,%xmm8
        xorpd     %xmm11,%xmm11

        subpd     %xmm3,%xmm4                   ## 0 - fij*tsc
        subpd     %xmm7,%xmm8
        subpd     %xmm10,%xmm11

        mulpd     nb303_rinvH1(%rsp),%xmm4
        mulpd     nb303_rinvH2(%rsp),%xmm8
        mulpd     nb303_rinvM(%rsp),%xmm11

        ## move j forces to xmm0-xmm2
        movq      nb303_faction(%rbp),%rdi
        movlpd    (%rdi,%rax,8),%xmm0
        movlpd    8(%rdi,%rax,8),%xmm1
        movlpd    16(%rdi,%rax,8),%xmm2
        movhpd    (%rdi,%rbx,8),%xmm0
        movhpd    8(%rdi,%rbx,8),%xmm1
        movhpd    16(%rdi,%rbx,8),%xmm2

        ## replicate each fscal for its x/y/z component
        movapd    %xmm4,%xmm3
        movapd    %xmm4,%xmm5
        movapd    %xmm8,%xmm7
        movapd    %xmm8,%xmm9
        movapd    %xmm11,%xmm10
        movapd    %xmm11,%xmm12

        ## component forces = fscal * dr
        mulpd     nb303_dxH1(%rsp),%xmm3
        mulpd     nb303_dyH1(%rsp),%xmm4
        mulpd     nb303_dzH1(%rsp),%xmm5
        mulpd     nb303_dxH2(%rsp),%xmm7
        mulpd     nb303_dyH2(%rsp),%xmm8
        mulpd     nb303_dzH2(%rsp),%xmm9
        mulpd     nb303_dxM(%rsp),%xmm10
        mulpd     nb303_dyM(%rsp),%xmm11
        mulpd     nb303_dzM(%rsp),%xmm12

        ## add each contribution to the j force first, then fold in the running
        ## i-force accumulator (the register is overwritten by that second add)
        addpd     %xmm3,%xmm0
        addpd     %xmm4,%xmm1
        addpd     %xmm5,%xmm2
        addpd     nb303_fixH1(%rsp),%xmm3
        addpd     nb303_fiyH1(%rsp),%xmm4
        addpd     nb303_fizH1(%rsp),%xmm5

        addpd     %xmm7,%xmm0
        addpd     %xmm8,%xmm1
        addpd     %xmm9,%xmm2
        addpd     nb303_fixH2(%rsp),%xmm7
        addpd     nb303_fiyH2(%rsp),%xmm8
        addpd     nb303_fizH2(%rsp),%xmm9

        addpd     %xmm10,%xmm0
        addpd     %xmm11,%xmm1
        addpd     %xmm12,%xmm2
        addpd     nb303_fixM(%rsp),%xmm10
        addpd     nb303_fiyM(%rsp),%xmm11
        addpd     nb303_fizM(%rsp),%xmm12

        movapd    %xmm3,nb303_fixH1(%rsp)
        movapd    %xmm4,nb303_fiyH1(%rsp)
        movapd    %xmm5,nb303_fizH1(%rsp)
        movapd    %xmm7,nb303_fixH2(%rsp)
        movapd    %xmm8,nb303_fiyH2(%rsp)
        movapd    %xmm9,nb303_fizH2(%rsp)
        movapd    %xmm10,nb303_fixM(%rsp)
        movapd    %xmm11,nb303_fiyM(%rsp)
        movapd    %xmm12,nb303_fizM(%rsp)

        ## store back j forces from xmm0-xmm2
        movlpd    %xmm0,(%rdi,%rax,8)
        movlpd    %xmm1,8(%rdi,%rax,8)
        movlpd    %xmm2,16(%rdi,%rax,8)
        movhpd    %xmm0,(%rdi,%rbx,8)
        movhpd    %xmm1,8(%rdi,%rbx,8)
        movhpd    %xmm2,16(%rdi,%rbx,8)

        ## should we do one more iteration?
        ## innerk -= 2; a negative result (signed) leaves the two-atom loop
        subl      $2,nb303_innerk(%rsp)
        jl        _nb_kernel303_x86_64_sse2.nb303_checksingle
        jmp       _nb_kernel303_x86_64_sse2.nb303_unroll_loop

_nb_kernel303_x86_64_sse2.nb303_checksingle:
        ## odd innerk -> handle one more j atom, even innerk -> finish the i atom
        movl      nb303_innerk(%rsp),%edx
        andl      $1,%edx
        jnz       _nb_kernel303_x86_64_sse2.nb303_dosingle
        jmp       _nb_kernel303_x86_64_sse2.nb303_updateouterdata

_nb_kernel303_x86_64_sse2.nb303_dosingle:
        ## Single-j-atom tail: scalar (low-lane) arithmetic from here on.
        movq      nb303_innerjjnr(%rsp),%rdx    ## pointer to jjnr[k]
        movl      (%rdx),%eax                   ## jnr; 32-bit load zero-extends rax

        movq      nb303_charge(%rbp),%rsi       ## base of charge[]
        xorpd     %xmm3,%xmm3                   ## clear so the upper lane is 0
        movlpd    (%rsi,%rax,8),%xmm3           ## charge[jnr] in the low lane
        movapd    %xmm3,%xmm4
        mulpd     nb303_iqM(%rsp),%xmm3
        mulpd     nb303_iqH(%rsp),%xmm4
        movapd    %xmm3,nb303_qqM(%rsp)
        movapd    %xmm4,nb303_qqH(%rsp)

        movq      nb303_pos(%rbp),%rsi          ## base of pos[]
        lea       (%rax,%rax,2),%rax            ## replace jnr with j3

        ## move coordinates to xmm4-xmm6 & xmm0-xmm2
        movlpd    (%rsi,%rax,8),%xmm4
        movlpd    8(%rsi,%rax,8),%xmm5
        movlpd    16(%rsi,%rax,8),%xmm6
        movapd    %xmm4,%xmm0
        movapd    %xmm5,%xmm1
        movapd    %xmm6,%xmm2

        ## calc dr (j - iM)
        subsd     nb303_ixM(%rsp),%xmm4
        subsd     nb303_iyM(%rsp),%xmm5
        subsd     nb303_izM(%rsp),%xmm6

        ## store dr
        movapd    %xmm4,nb303_dxM(%rsp)
        movapd    %xmm5,nb303_dyM(%rsp)
        movapd    %xmm6,nb303_dzM(%rsp)

        ## square it
        mulsd     %xmm4,%xmm4
        mulsd     %xmm5,%xmm5
        mulsd     %xmm6,%xmm6
        addsd     %xmm5,%xmm4
        addsd     %xmm6,%xmm4
        movapd    %xmm4,%xmm7
        ## rsqM in xmm7

        ## move j coords to xmm4-xmm6
        movapd    %xmm0,%xmm4
        movapd    %xmm1,%xmm5
        movapd    %xmm2,%xmm6

        ## calc dr (j - iH1)
        subsd     nb303_ixH1(%rsp),%xmm4
        subsd     nb303_iyH1(%rsp),%xmm5
        subsd     nb303_izH1(%rsp),%xmm6

        ## store dr
        movapd    %xmm4,nb303_dxH1(%rsp)
        movapd    %xmm5,nb303_dyH1(%rsp)
        movapd    %xmm6,nb303_dzH1(%rsp)

        ## square it
        mulsd     %xmm4,%xmm4
        mulsd     %xmm5,%xmm5
        mulsd     %xmm6,%xmm6
        addsd     %xmm5,%xmm6
        addsd     %xmm4,%xmm6
        ## rsqH1 in xmm6

        ## move j coords to xmm3-xmm5
        movapd    %xmm0,%xmm3
        movapd    %xmm1,%xmm4
        movapd    %xmm2,%xmm5

        ## calc dr (j - iH2)
        subsd     nb303_ixH2(%rsp),%xmm3
        subsd     nb303_iyH2(%rsp),%xmm4
        subsd     nb303_izH2(%rsp),%xmm5

        ## store dr
        movapd    %xmm3,nb303_dxH2(%rsp)
        movapd    %xmm4,nb303_dyH2(%rsp)
        movapd    %xmm5,nb303_dzH2(%rsp)

        ## square it
        mulsd     %xmm3,%xmm3
        mulsd     %xmm4,%xmm4
        mulsd     %xmm5,%xmm5
        addsd     %xmm4,%xmm5
        addsd     %xmm3,%xmm5
        ## rsqH2 in xmm5, rsqH1 in xmm6, rsqM in xmm7

        ## start with rsqM - put seed in xmm2
        ## (single-precision rsqrtss estimate, refined twice in double precision)
        cvtsd2ss  %xmm7,%xmm2
        rsqrtss   %xmm2,%xmm2
        cvtss2sd  %xmm2,%xmm2

        movapd    %xmm2,%xmm3
        mulsd     %xmm2,%xmm2
        movapd    nb303_three(%rsp),%xmm4
        mulsd     %xmm7,%xmm2                   ## rsq*lu*lu
        subsd     %xmm2,%xmm4                   ## 3-rsq*lu*lu
        mulsd     %xmm3,%xmm4                   ## lu*(3-rsq*lu*lu)
        mulsd     nb303_half(%rsp),%xmm4        ## iter1 ( new lu)

        movapd    %xmm7,%xmm2
        movapd    %xmm4,%xmm3
        mulsd     %xmm4,%xmm4                   ## lu*lu
        mulsd     %xmm4,%xmm2                   ## rsq*lu*lu
        movapd    nb303_three(%rsp),%xmm4
        subsd     %xmm2,%xmm4                   ## 3-rsq*lu*lu
        mulsd     %xmm3,%xmm4                   ## lu*( 3-rsq*lu*lu)
        mulsd     nb303_half(%rsp),%xmm4        ## rinv
        movapd    %xmm4,nb303_rinvM(%rsp)       ## rinvM in xmm4
        mulsd     %xmm4,%xmm7                   ## r = rsq*rinv
        movapd    %xmm7,nb303_rM(%rsp)          ## r in xmm7

        ## rsqH1 - seed in xmm2
        cvtsd2ss  %xmm6,%xmm2
        rsqrtss   %xmm2,%xmm2
        cvtss2sd  %xmm2,%xmm2

        movapd    %xmm2,%xmm3
        mulsd     %xmm2,%xmm2
        movapd    nb303_three(%rsp),%xmm4
        mulsd     %xmm6,%xmm2                   ## rsq*lu*lu
        subsd     %xmm2,%xmm4                   ## 3-rsq*lu*lu
        mulsd     %xmm3,%xmm4                   ## lu*(3-rsq*lu*lu)
        mulsd     nb303_half(%rsp),%xmm4        ## iter1 ( new lu)

        movapd    %xmm6,%xmm2
        movapd    %xmm4,%xmm3
        mulsd     %xmm4,%xmm4                   ## lu*lu
        mulsd     %xmm4,%xmm2                   ## rsq*lu*lu
        movapd    nb303_three(%rsp),%xmm4
        subsd     %xmm2,%xmm4                   ## 3-rsq*lu*lu
        mulsd     %xmm3,%xmm4                   ## lu*( 3-rsq*lu*lu)
        mulsd     nb303_half(%rsp),%xmm4        ## rinv
        movapd    %xmm4,nb303_rinvH1(%rsp)      ## rinvH1
        mulsd     %xmm4,%xmm6
        movapd    %xmm6,nb303_rH1(%rsp)         ## rH1

        ## rsqH2 - seed in xmm2
        cvtsd2ss  %xmm5,%xmm2
        rsqrtss   %xmm2,%xmm2
        cvtss2sd  %xmm2,%xmm2

        movapd    %xmm2,%xmm3
        mulsd     %xmm2,%xmm2
        movapd    nb303_three(%rsp),%xmm4
        mulsd     %xmm5,%xmm2                   ## rsq*lu*lu
        subsd     %xmm2,%xmm4                   ## 3-rsq*lu*lu
        mulsd     %xmm3,%xmm4                   ## lu*(3-rsq*lu*lu)
        mulsd     nb303_half(%rsp),%xmm4        ## iter1 ( new lu)

        movapd    %xmm5,%xmm2
        movapd    %xmm4,%xmm3
        mulsd     %xmm4,%xmm4                   ## lu*lu
        mulsd     %xmm4,%xmm2                   ## rsq*lu*lu
        movapd    nb303_three(%rsp),%xmm4
        subsd     %xmm2,%xmm4                   ## 3-rsq*lu*lu
        mulsd     %xmm3,%xmm4                   ## lu*( 3-rsq*lu*lu)
        mulsd     nb303_half(%rsp),%xmm4        ## rinv
        movapd    %xmm4,nb303_rinvH2(%rsp)      ## rinvH2
        mulsd     %xmm4,%xmm5
        movapd    %xmm5,nb303_rH2(%rsp)         ## rH2

        ## do M interactions
        ## rM is still in xmm7

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?