nb_kernel112_x86_64_sse.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,220 行 · 第 1/5 页

S
2,220
字号
            ;# ------------------------------------------------------------------
            ;# Fragment of the nb112 kernel body; the code before and after this
            ;# span is outside this view.
            ;#
            ;# Visible work, in order:
            ;#   1. jO  vs. iO/iH1/iH2 : distances, 1/r, Coulomb + LJ (O-O only),
            ;#                           i- and j-force update
            ;#   2. jH1 vs. iO/iH1/iH2 : distances, 1/r, Coulomb, force update
            ;#   3. jH2                : coordinate load and first distance subtractions
            ;#
            ;# Register use as shown by the existing comments and addressing:
            ;#   rsi + idx*4 : j coordinates     rdi + idx*4 : j forces
            ;#   rax, rbx, rcx, rdx : indices of the four j entries (a, b, c, d)
            ;#   [rsp + nb112_*]    : stack-resident locals (offsets defined elsewhere)
            ;# ------------------------------------------------------------------

            ;# On entry xmm0-2, xmm3-5 and xmm6-8 each hold a copy of the j
            ;# coordinates; subtract the i-atom coordinates to get dx/dy/dz.
            subps   xmm0, [rsp + nb112_ixO]
            subps   xmm1, [rsp + nb112_iyO]
            subps   xmm2, [rsp + nb112_izO]
            subps   xmm3, [rsp + nb112_ixH1]
            subps   xmm4, [rsp + nb112_iyH1]
            subps   xmm5, [rsp + nb112_izH1]
            subps   xmm6, [rsp + nb112_ixH2]
            subps   xmm7, [rsp + nb112_iyH2]
            subps   xmm8, [rsp + nb112_izH2]

            ;# save the distance vectors, then square the components
            movaps  [rsp + nb112_dxOO], xmm0
            movaps  [rsp + nb112_dyOO], xmm1
            movaps  [rsp + nb112_dzOO], xmm2
            mulps   xmm0, xmm0
            mulps   xmm1, xmm1
            mulps   xmm2, xmm2
            movaps  [rsp + nb112_dxH1O], xmm3
            movaps  [rsp + nb112_dyH1O], xmm4
            movaps  [rsp + nb112_dzH1O], xmm5
            mulps   xmm3, xmm3
            mulps   xmm4, xmm4
            mulps   xmm5, xmm5
            movaps  [rsp + nb112_dxH2O], xmm6
            movaps  [rsp + nb112_dyH2O], xmm7
            movaps  [rsp + nb112_dzH2O], xmm8
            mulps   xmm6, xmm6
            mulps   xmm7, xmm7
            mulps   xmm8, xmm8

            ;# rsq = dx*dx + dy*dy + dz*dz  (xmm0, xmm3, xmm6)
            addps   xmm0, xmm1
            addps   xmm0, xmm2
            addps   xmm3, xmm4
            addps   xmm3, xmm5
            addps   xmm6, xmm7
            addps   xmm6, xmm8

            ;# start doing invsqrt for jO atoms
            rsqrtps xmm1, xmm0
            rsqrtps xmm4, xmm3
            rsqrtps xmm7, xmm6

            movaps  xmm2, xmm1
            movaps  xmm5, xmm4
            movaps  xmm8, xmm7

            mulps   xmm1, xmm1              ;# lu*lu
            mulps   xmm4, xmm4              ;# lu*lu
            mulps   xmm7, xmm7              ;# lu*lu

            movaps  xmm9, [rsp + nb112_three]
            movaps  xmm10, xmm9
            movaps  xmm11, xmm9

            mulps   xmm1, xmm0              ;# rsq*lu*lu
            mulps   xmm4, xmm3              ;# rsq*lu*lu
            mulps   xmm7, xmm6              ;# rsq*lu*lu

            subps   xmm9, xmm1
            subps   xmm10, xmm4
            subps   xmm11, xmm7             ;# 3-rsq*lu*lu

            mulps   xmm9, xmm2
            mulps   xmm10, xmm5
            mulps   xmm11, xmm8             ;# lu*(3-rsq*lu*lu)

            movaps  xmm0, [rsp + nb112_half]
            mulps   xmm9, xmm0              ;# rinvOO
            mulps   xmm10, xmm0             ;# rinvH1O
            mulps   xmm11, xmm0             ;# rinvH2O

            ;# O interactions
            movaps  xmm0, xmm9
            movaps  xmm1, xmm10
            movaps  xmm2, xmm11
            mulps   xmm9, xmm9              ;# rinvsq
            mulps   xmm10, xmm10
            mulps   xmm11, xmm11
            movaps  xmm12, xmm9
            mulps   xmm12, xmm12            ;# rinv4
            mulps   xmm12, xmm9             ;# rinv6
            mulps   xmm0, [rsp + nb112_qqOO]
            mulps   xmm1, [rsp + nb112_qqOH]
            mulps   xmm2, [rsp + nb112_qqOH]
            movaps  xmm13, xmm12            ;# rinv6
            mulps   xmm12, xmm12            ;# rinv12
            mulps   xmm13, [rsp + nb112_c6]
            mulps   xmm12, [rsp + nb112_c12]
            movaps  xmm14, xmm12
            subps   xmm14, xmm13            ;# c12*rinv12 - c6*rinv6

            addps   xmm14, [rsp + nb112_Vvdwtot]
            mulps   xmm13, [rsp + nb112_six]
            mulps   xmm12, [rsp + nb112_twelve]
            movaps  [rsp + nb112_Vvdwtot], xmm14
            subps   xmm12, xmm13            ;# LJ fscal

            addps   xmm12, xmm0             ;# add the O-O Coulomb term

            ;# scale by rinvsq: xmm9/xmm10/xmm11 become the per-pair force scalars
            mulps   xmm9, xmm12
            mulps   xmm10, xmm1
            mulps   xmm11, xmm2

            ;# accumulate the three Coulomb terms into vctot
            addps   xmm0, [rsp + nb112_vctot]
            addps   xmm1, xmm2
            addps   xmm0, xmm1
            movaps  [rsp + nb112_vctot], xmm0

            ;# move j O forces to local temp variables
            movlps  xmm0, [rdi + rax*4]     ;# jxOa jyOa  -   -
            movlps  xmm1, [rdi + rcx*4]     ;# jxOc jyOc  -   -
            movhps  xmm0, [rdi + rbx*4]     ;# jxOa jyOa jxOb jyOb
            movhps  xmm1, [rdi + rdx*4]     ;# jxOc jyOc jxOd jyOd

            movss   xmm2, [rdi + rax*4 + 8] ;# jzOa  -  -  -
            movss   xmm3, [rdi + rcx*4 + 8] ;# jzOc  -  -  -
            movhps  xmm2, [rdi + rbx*4 + 8] ;# jzOa  -  jzOb  -
            movhps  xmm3, [rdi + rdx*4 + 8] ;# jzOc  -  jzOd -

            shufps  xmm2, xmm3,  136        ;# 10001000 => jzOa jzOb jzOc jzOd

            ;# xmm0: jxOa jyOa jxOb jyOb
            ;# xmm1: jxOc jyOc jxOd jyOd
            ;# xmm2: jzOa jzOb jzOc jzOd

            ;# replicate each force scalar three times (one copy per x/y/z)
            movaps  xmm7, xmm9
            movaps  xmm8, xmm9
            movaps  xmm13, xmm11
            movaps  xmm14, xmm11
            movaps  xmm15, xmm11
            movaps  xmm11, xmm10
            movaps  xmm12, xmm10

            ;# force components = scalar * saved distance component
            mulps   xmm7, [rsp + nb112_dxOO]
            mulps   xmm8, [rsp + nb112_dyOO]
            mulps   xmm9, [rsp + nb112_dzOO]
            mulps   xmm10, [rsp + nb112_dxH1O]
            mulps   xmm11, [rsp + nb112_dyH1O]
            mulps   xmm12, [rsp + nb112_dzH1O]
            mulps   xmm13, [rsp + nb112_dxH2O]
            mulps   xmm14, [rsp + nb112_dyH2O]
            mulps   xmm15, [rsp + nb112_dzH2O]

            ;# Sum the j-atom contribution (x in xmm3, y in xmm4, z straight
            ;# into xmm2) while also updating the i-atom force accumulators.
            movaps  xmm3, xmm7
            movaps  xmm4, xmm8
            addps   xmm2, xmm9
            addps   xmm7, [rsp + nb112_fixO]
            addps   xmm8, [rsp + nb112_fiyO]
            addps   xmm9, [rsp + nb112_fizO]

            addps   xmm3, xmm10
            addps   xmm4, xmm11
            addps   xmm2, xmm12
            addps   xmm10, [rsp + nb112_fixH1]
            addps   xmm11, [rsp + nb112_fiyH1]
            addps   xmm12, [rsp + nb112_fizH1]

            addps   xmm3, xmm13
            addps   xmm4, xmm14
            addps   xmm2, xmm15
            addps   xmm13, [rsp + nb112_fixH2]
            addps   xmm14, [rsp + nb112_fiyH2]
            addps   xmm15, [rsp + nb112_fizH2]

            movaps  [rsp + nb112_fixO], xmm7
            movaps  [rsp + nb112_fiyO], xmm8
            movaps  [rsp + nb112_fizO], xmm9
            movaps  [rsp + nb112_fixH1], xmm10
            movaps  [rsp + nb112_fiyH1], xmm11
            movaps  [rsp + nb112_fizH1], xmm12
            movaps  [rsp + nb112_fixH2], xmm13
            movaps  [rsp + nb112_fiyH2], xmm14
            movaps  [rsp + nb112_fizH2], xmm15

            ;# xmm3 = fOx, xmm4 = fOy; the z sum is already folded into xmm2
            ;# interleave x/y so they line up with the xy pairs in xmm0/xmm1
            movaps  xmm5, xmm3
            unpcklps xmm3, xmm4             ;# fOxa fOya fOxb fOyb
            unpckhps xmm5, xmm4             ;# fOxc fOyc fOxd fOyd

            addps   xmm0, xmm3
            addps   xmm1, xmm5

            movhlps xmm3, xmm2              ;# fOzc fOzd

            ;# write the updated j O forces back
            movlps  [rdi + rax*4], xmm0
            movhps  [rdi + rbx*4], xmm0
            movlps  [rdi + rcx*4], xmm1
            movhps  [rdi + rdx*4], xmm1
            movss   [rdi + rax*4 + 8], xmm2
            movss   [rdi + rcx*4 + 8], xmm3
            shufps  xmm2, xmm2, 1           ;# bring element 1 (b) down to lane 0
            shufps  xmm3, xmm3, 1           ;# bring element 1 (d) down to lane 0
            movss   [rdi + rbx*4 + 8], xmm2
            movss   [rdi + rdx*4 + 8], xmm3

            ;# move j H1 coordinates to local temp variables
            movlps  xmm0, [rsi + rax*4 + 12] ;# jxH1a jyH1a  -   -
            movlps  xmm1, [rsi + rcx*4 + 12] ;# jxH1c jyH1c  -   -
            movhps  xmm0, [rsi + rbx*4 + 12] ;# jxH1a jyH1a jxH1b jyH1b
            movhps  xmm1, [rsi + rdx*4 + 12] ;# jxH1c jyH1c jxH1d jyH1d

            movss   xmm2, [rsi + rax*4 + 20] ;# jzH1a  -  -  -
            movss   xmm3, [rsi + rcx*4 + 20] ;# jzH1c  -  -  -
            movhps  xmm2, [rsi + rbx*4 + 20] ;# jzH1a  -  jzH1b  -
            movhps  xmm3, [rsi + rdx*4 + 20] ;# jzH1c  -  jzH1d -

            ;# transpose the xy pairs into one x vector and one y vector
            movaps  xmm4, xmm0
            unpcklps xmm0, xmm1             ;# jxH1a jxH1c jyH1a jyH1c
            unpckhps xmm4, xmm1             ;# jxH1b jxH1d jyH1b jyH1d
            movaps  xmm1, xmm0
            unpcklps xmm0, xmm4             ;# x
            unpckhps xmm1, xmm4             ;# y

            shufps  xmm2, xmm3,  136        ;# 10001000 => jzH1a jzH1b jzH1c jzH1d

            ;# xmm0 = H1x
            ;# xmm1 = H1y
            ;# xmm2 = H1z

            movaps  xmm3, xmm0
            movaps  xmm4, xmm1
            movaps  xmm5, xmm2
            movaps  xmm6, xmm0
            movaps  xmm7, xmm1
            movaps  xmm8, xmm2

            subps   xmm0, [rsp + nb112_ixO]
            subps   xmm1, [rsp + nb112_iyO]
            subps   xmm2, [rsp + nb112_izO]
            subps   xmm3, [rsp + nb112_ixH1]
            subps   xmm4, [rsp + nb112_iyH1]
            subps   xmm5, [rsp + nb112_izH1]
            subps   xmm6, [rsp + nb112_ixH2]
            subps   xmm7, [rsp + nb112_iyH2]
            subps   xmm8, [rsp + nb112_izH2]

            ;# save the distance vectors, then square the components
            movaps  [rsp + nb112_dxOH1], xmm0
            movaps  [rsp + nb112_dyOH1], xmm1
            movaps  [rsp + nb112_dzOH1], xmm2
            mulps   xmm0, xmm0
            mulps   xmm1, xmm1
            mulps   xmm2, xmm2
            movaps  [rsp + nb112_dxH1H1], xmm3
            movaps  [rsp + nb112_dyH1H1], xmm4
            movaps  [rsp + nb112_dzH1H1], xmm5
            mulps   xmm3, xmm3
            mulps   xmm4, xmm4
            mulps   xmm5, xmm5
            movaps  [rsp + nb112_dxH2H1], xmm6
            movaps  [rsp + nb112_dyH2H1], xmm7
            movaps  [rsp + nb112_dzH2H1], xmm8
            mulps   xmm6, xmm6
            mulps   xmm7, xmm7
            mulps   xmm8, xmm8

            ;# rsq = dx*dx + dy*dy + dz*dz  (xmm0, xmm3, xmm6)
            addps   xmm0, xmm1
            addps   xmm0, xmm2
            addps   xmm3, xmm4
            addps   xmm3, xmm5
            addps   xmm6, xmm7
            addps   xmm6, xmm8

            ;# start doing invsqrt for jH1 atoms
            rsqrtps xmm1, xmm0
            rsqrtps xmm4, xmm3
            rsqrtps xmm7, xmm6

            movaps  xmm2, xmm1
            movaps  xmm5, xmm4
            movaps  xmm8, xmm7

            mulps   xmm1, xmm1              ;# lu*lu
            mulps   xmm4, xmm4              ;# lu*lu
            mulps   xmm7, xmm7              ;# lu*lu

            movaps  xmm9, [rsp + nb112_three]
            movaps  xmm10, xmm9
            movaps  xmm11, xmm9

            mulps   xmm1, xmm0              ;# rsq*lu*lu
            mulps   xmm4, xmm3              ;# rsq*lu*lu
            mulps   xmm7, xmm6              ;# rsq*lu*lu

            subps   xmm9, xmm1
            subps   xmm10, xmm4
            subps   xmm11, xmm7             ;# 3-rsq*lu*lu

            mulps   xmm9, xmm2
            mulps   xmm10, xmm5
            mulps   xmm11, xmm8             ;# lu*(3-rsq*lu*lu)

            movaps  xmm0, [rsp + nb112_half]
            mulps   xmm9, xmm0              ;# rinvOH1
            mulps   xmm10, xmm0             ;# rinvH1H1
            mulps   xmm11, xmm0             ;# rinvH2H1

            ;# H1 interactions
            movaps  xmm0, xmm9
            movaps  xmm1, xmm10
            movaps  xmm2, xmm11
            mulps   xmm9, xmm9              ;# rinvsq
            mulps   xmm10, xmm10
            mulps   xmm11, xmm11
            mulps   xmm0, [rsp + nb112_qqOH]
            mulps   xmm1, [rsp + nb112_qqHH]
            mulps   xmm2, [rsp + nb112_qqHH]
            ;# scale by rinvsq: xmm9/xmm10/xmm11 become the per-pair force scalars
            mulps   xmm9, xmm0
            mulps   xmm10, xmm1
            mulps   xmm11, xmm2

            ;# accumulate the three Coulomb terms into vctot
            addps   xmm0, [rsp + nb112_vctot]
            addps   xmm1, xmm2
            addps   xmm0, xmm1
            movaps  [rsp + nb112_vctot], xmm0

            ;# move j H1 forces to local temp variables
            movlps  xmm0, [rdi + rax*4 + 12] ;# jxH1a jyH1a  -   -
            movlps  xmm1, [rdi + rcx*4 + 12] ;# jxH1c jyH1c  -   -
            movhps  xmm0, [rdi + rbx*4 + 12] ;# jxH1a jyH1a jxH1b jyH1b
            movhps  xmm1, [rdi + rdx*4 + 12] ;# jxH1c jyH1c jxH1d jyH1d

            movss   xmm2, [rdi + rax*4 + 20] ;# jzH1a  -  -  -
            movss   xmm3, [rdi + rcx*4 + 20] ;# jzH1c  -  -  -
            movhps  xmm2, [rdi + rbx*4 + 20] ;# jzH1a  -  jzH1b  -
            movhps  xmm3, [rdi + rdx*4 + 20] ;# jzH1c  -  jzH1d -

            shufps  xmm2, xmm3,  136        ;# 10001000 => jzH1a jzH1b jzH1c jzH1d

            ;# xmm0: jxH1a jyH1a jxH1b jyH1b
            ;# xmm1: jxH1c jyH1c jxH1d jyH1d
            ;# xmm2: jzH1a jzH1b jzH1c jzH1d

            ;# replicate each force scalar three times (one copy per x/y/z)
            movaps  xmm7, xmm9
            movaps  xmm8, xmm9
            movaps  xmm13, xmm11
            movaps  xmm14, xmm11
            movaps  xmm15, xmm11
            movaps  xmm11, xmm10
            movaps  xmm12, xmm10

            ;# force components = scalar * saved distance component
            mulps   xmm7, [rsp + nb112_dxOH1]
            mulps   xmm8, [rsp + nb112_dyOH1]
            mulps   xmm9, [rsp + nb112_dzOH1]
            mulps   xmm10, [rsp + nb112_dxH1H1]
            mulps   xmm11, [rsp + nb112_dyH1H1]
            mulps   xmm12, [rsp + nb112_dzH1H1]
            mulps   xmm13, [rsp + nb112_dxH2H1]
            mulps   xmm14, [rsp + nb112_dyH2H1]
            mulps   xmm15, [rsp + nb112_dzH2H1]

            ;# Sum the j-atom contribution (x in xmm3, y in xmm4, z straight
            ;# into xmm2) while also updating the i-atom force accumulators.
            movaps  xmm3, xmm7
            movaps  xmm4, xmm8
            addps   xmm2, xmm9
            addps   xmm7, [rsp + nb112_fixO]
            addps   xmm8, [rsp + nb112_fiyO]
            addps   xmm9, [rsp + nb112_fizO]

            addps   xmm3, xmm10
            addps   xmm4, xmm11
            addps   xmm2, xmm12
            addps   xmm10, [rsp + nb112_fixH1]
            addps   xmm11, [rsp + nb112_fiyH1]
            addps   xmm12, [rsp + nb112_fizH1]

            addps   xmm3, xmm13
            addps   xmm4, xmm14
            addps   xmm2, xmm15
            addps   xmm13, [rsp + nb112_fixH2]
            addps   xmm14, [rsp + nb112_fiyH2]
            addps   xmm15, [rsp + nb112_fizH2]

            movaps  [rsp + nb112_fixO], xmm7
            movaps  [rsp + nb112_fiyO], xmm8
            movaps  [rsp + nb112_fizO], xmm9
            movaps  [rsp + nb112_fixH1], xmm10
            movaps  [rsp + nb112_fiyH1], xmm11
            movaps  [rsp + nb112_fizH1], xmm12
            movaps  [rsp + nb112_fixH2], xmm13
            movaps  [rsp + nb112_fiyH2], xmm14
            movaps  [rsp + nb112_fizH2], xmm15

            ;# xmm3 = fH1x, xmm4 = fH1y; the z sum is already folded into xmm2
            ;# interleave x/y so they line up with the xy pairs in xmm0/xmm1
            movaps  xmm5, xmm3
            unpcklps xmm3, xmm4             ;# fH1xa fH1ya fH1xb fH1yb
            unpckhps xmm5, xmm4             ;# fH1xc fH1yc fH1xd fH1yd

            addps   xmm0, xmm3
            addps   xmm1, xmm5

            movhlps xmm3, xmm2              ;# fH1zc fH1zd

            ;# write the updated j H1 forces back
            movlps  [rdi + rax*4 + 12], xmm0
            movhps  [rdi + rbx*4 + 12], xmm0
            movlps  [rdi + rcx*4 + 12], xmm1
            movhps  [rdi + rdx*4 + 12], xmm1
            movss   [rdi + rax*4 + 20], xmm2
            movss   [rdi + rcx*4 + 20], xmm3
            shufps  xmm2, xmm2, 1           ;# bring element 1 (b) down to lane 0
            shufps  xmm3, xmm3, 1           ;# bring element 1 (d) down to lane 0
            movss   [rdi + rbx*4 + 20], xmm2
            movss   [rdi + rdx*4 + 20], xmm3

            ;# move j H2 coordinates to local temp variables
            movlps  xmm0, [rsi + rax*4 + 24] ;# jxH2a jyH2a  -   -
            movlps  xmm1, [rsi + rcx*4 + 24] ;# jxH2c jyH2c  -   -
            movhps  xmm0, [rsi + rbx*4 + 24] ;# jxH2a jyH2a jxH2b jyH2b
            movhps  xmm1, [rsi + rdx*4 + 24] ;# jxH2c jyH2c jxH2d jyH2d

            ;# The z values are fetched with 4-byte movss loads plus movlhps
            ;# here, unlike the 8-byte movhps loads used for the O and H1 z
            ;# values. NOTE(review): presumably so nothing beyond offset 32+4
            ;# is read - confirm against the coordinate array layout.
            movss   xmm2, [rsi + rax*4 + 32] ;# jzH2a  -  -  -
            movss   xmm3, [rsi + rcx*4 + 32] ;# jzH2c  -  -  -
            movss   xmm5, [rsi + rbx*4 + 32] ;# jzH2b  -  -  -
            movss   xmm6, [rsi + rdx*4 + 32] ;# jzH2d  -  -  -
            movlhps xmm2, xmm5              ;# jzH2a  -  jzH2b  -
            movlhps xmm3, xmm6              ;# jzH2c  -  jzH2d -

            ;# transpose the xy pairs into one x vector and one y vector
            movaps  xmm4, xmm0
            unpcklps xmm0, xmm1             ;# jxH2a jxH2c jyH2a jyH2c
            unpckhps xmm4, xmm1             ;# jxH2b jxH2d jyH2b jyH2d
            movaps  xmm1, xmm0
            unpcklps xmm0, xmm4             ;# x
            unpckhps xmm1, xmm4             ;# y

            shufps  xmm2, xmm3,  136        ;# 10001000 => jzH2a jzH2b jzH2c jzH2d

            ;# xmm0 = H2x
            ;# xmm1 = H2y
            ;# xmm2 = H2z

            movaps  xmm3, xmm0
            movaps  xmm4, xmm1
            movaps  xmm5, xmm2
            movaps  xmm6, xmm0
            movaps  xmm7, xmm1
            movaps  xmm8, xmm2

            subps   xmm0, [rsp + nb112_ixO]
            subps   xmm1, [rsp + nb112_iyO]
            subps   xmm2, [rsp + nb112_izO]
            subps   xmm3, [rsp + nb112_ixH1]
            subps   xmm4, [rsp + nb112_iyH1]

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?