nb_kernel112_x86_64_sse.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,220 行 · 第 1/5 页

S
2,220
字号
    ;# ---- unrolled loop, j-H2 atoms of four j waters vs. iO / iH1 / iH2 ----
    ;# This excerpt starts mid-sequence: xmm0-xmm4 already hold displacement
    ;# components computed before this point; the last four are finished here.
    ;# One statement per line: a ';#' comment runs to end of line, so
    ;# instructions must never share a line with a preceding comment.
    subps xmm5, [rsp + nb112_izH1]
    subps xmm6, [rsp + nb112_ixH2]
    subps xmm7, [rsp + nb112_iyH2]
    subps xmm8, [rsp + nb112_izH2]

    ;# save the displacement vectors (needed again for the forces) and
    ;# square each component in place
    movaps [rsp + nb112_dxOH2], xmm0
    movaps [rsp + nb112_dyOH2], xmm1
    movaps [rsp + nb112_dzOH2], xmm2
    mulps  xmm0, xmm0
    mulps  xmm1, xmm1
    mulps  xmm2, xmm2
    movaps [rsp + nb112_dxH1H2], xmm3
    movaps [rsp + nb112_dyH1H2], xmm4
    movaps [rsp + nb112_dzH1H2], xmm5
    mulps  xmm3, xmm3
    mulps  xmm4, xmm4
    mulps  xmm5, xmm5
    movaps [rsp + nb112_dxH2H2], xmm6
    movaps [rsp + nb112_dyH2H2], xmm7
    movaps [rsp + nb112_dzH2H2], xmm8
    mulps  xmm6, xmm6
    mulps  xmm7, xmm7
    mulps  xmm8, xmm8

    ;# rsq = dx*dx + dy*dy + dz*dz
    addps  xmm0, xmm1
    addps  xmm0, xmm2       ;# xmm0 = rsqOH2
    addps  xmm3, xmm4
    addps  xmm3, xmm5       ;# xmm3 = rsqH1H2
    addps  xmm6, xmm7
    addps  xmm6, xmm8       ;# xmm6 = rsqH2H2

    ;# start doing invsqrt for jH2 atoms:
    ;# lu = rsqrtps(rsq), refined as rinv = half * lu * (three - rsq*lu*lu)
    rsqrtps xmm1, xmm0
    rsqrtps xmm4, xmm3
    rsqrtps xmm7, xmm6

    movaps  xmm2, xmm1      ;# keep lu
    movaps  xmm5, xmm4
    movaps  xmm8, xmm7

    mulps   xmm1, xmm1      ;# lu*lu
    mulps   xmm4, xmm4      ;# lu*lu
    mulps   xmm7, xmm7      ;# lu*lu

    movaps  xmm9, [rsp + nb112_three]
    movaps  xmm10, xmm9
    movaps  xmm11, xmm9

    mulps   xmm1, xmm0      ;# rsq*lu*lu
    mulps   xmm4, xmm3      ;# rsq*lu*lu
    mulps   xmm7, xmm6      ;# rsq*lu*lu

    subps   xmm9, xmm1      ;# 3-rsq*lu*lu
    subps   xmm10, xmm4     ;# 3-rsq*lu*lu
    subps   xmm11, xmm7     ;# 3-rsq*lu*lu

    mulps   xmm9, xmm2      ;# lu*(3-rsq*lu*lu)
    mulps   xmm10, xmm5     ;# lu*(3-rsq*lu*lu)
    mulps   xmm11, xmm8     ;# lu*(3-rsq*lu*lu)

    movaps  xmm0, [rsp + nb112_half]
    mulps   xmm9, xmm0      ;# rinvOH2
    mulps   xmm10, xmm0     ;# rinvH1H2
    mulps   xmm11, xmm0     ;# rinvH2H2

    ;# H2 interactions
    movaps xmm0, xmm9
    movaps xmm1, xmm10
    movaps xmm2, xmm11
    mulps  xmm9, xmm9       ;# rinvsq
    mulps  xmm10, xmm10     ;# rinvsq
    mulps  xmm11, xmm11     ;# rinvsq
    mulps  xmm0, [rsp + nb112_qqOH]     ;# vcoul = rinv*qq
    mulps  xmm1, [rsp + nb112_qqHH]
    mulps  xmm2, [rsp + nb112_qqHH]
    mulps  xmm9, xmm0       ;# rinvsq*vcoul (used as the force scalar below)
    mulps  xmm10, xmm1
    mulps  xmm11, xmm2

    ;# add the three coulomb terms to the running total
    addps xmm0, [rsp + nb112_vctot]
    addps xmm1, xmm2
    addps xmm0, xmm1
    movaps [rsp + nb112_vctot], xmm0

    ;# move j H2 forces to local temp variables
    ;# NOTE(review): rdi and the four indices rax/rbx/rcx/rdx (j waters a-d)
    ;# are set before this excerpt; rdi is presumably the force array base.
    movlps xmm0, [rdi + rax*4 + 24] ;# jxH2a jyH2a  -   -
;# ---- unrolled loop: finish the j-H2 force update, then loop control ----
;# One statement per line and labels on their own lines: a ';#' comment runs
;# to end of line, and a label glued to a jump operand is not a label.
    movlps xmm1, [rdi + rcx*4 + 24] ;# jxH2c jyH2c  -   -
    movhps xmm0, [rdi + rbx*4 + 24] ;# jxH2a jyH2a jxH2b jyH2b
    movhps xmm1, [rdi + rdx*4 + 24] ;# jxH2c jyH2c jxH2d jyH2d

    movss  xmm2, [rdi + rax*4 + 32] ;# jzH2a  -  -  -
    movss  xmm3, [rdi + rcx*4 + 32] ;# jzH2c  -  -  -
    movss  xmm7, [rdi + rbx*4 + 32] ;# jzH2b  -  -  -
    movss  xmm8, [rdi + rdx*4 + 32] ;# jzH2d  -  -  -
    movlhps xmm2, xmm7              ;# jzH2a  -  jzH2b  -
    movlhps xmm3, xmm8              ;# jzH2c  -  jzH2d  -

    shufps xmm2, xmm3,  136         ;# 10001000 => jzH2a jzH2b jzH2c jzH2d

    ;# xmm0: jxH2a jyH2a jxH2b jyH2b
    ;# xmm1: jxH2c jyH2c jxH2d jyH2d
    ;# xmm2: jzH2a jzH2b jzH2c jzH2d

    ;# replicate the three force scalars (xmm9, xmm10, xmm11) so each can be
    ;# multiplied by its x, y and z displacement
    movaps xmm7, xmm9
    movaps xmm8, xmm9
    movaps xmm13, xmm11
    movaps xmm14, xmm11
    movaps xmm15, xmm11
    movaps xmm11, xmm10
    movaps xmm12, xmm10

    mulps xmm7, [rsp + nb112_dxOH2]
    mulps xmm8, [rsp + nb112_dyOH2]
    mulps xmm9, [rsp + nb112_dzOH2]
    mulps xmm10, [rsp + nb112_dxH1H2]
    mulps xmm11, [rsp + nb112_dyH1H2]
    mulps xmm12, [rsp + nb112_dzH1H2]
    mulps xmm13, [rsp + nb112_dxH2H2]
    mulps xmm14, [rsp + nb112_dyH2H2]
    mulps xmm15, [rsp + nb112_dzH2H2]

    ;# j side: xmm3 / xmm4 collect the x / y contributions, xmm2 (already
    ;# holding the loaded z values) collects z.
    ;# i side: each contribution is also added to the matching fi accumulator.
    movaps xmm3, xmm7
    movaps xmm4, xmm8
    addps xmm2, xmm9
    addps xmm7, [rsp + nb112_fixO]
    addps xmm8, [rsp + nb112_fiyO]
    addps xmm9, [rsp + nb112_fizO]

    addps xmm3, xmm10
    addps xmm4, xmm11
    addps xmm2, xmm12
    addps xmm10, [rsp + nb112_fixH1]
    addps xmm11, [rsp + nb112_fiyH1]
    addps xmm12, [rsp + nb112_fizH1]

    addps xmm3, xmm13
    addps xmm4, xmm14
    addps xmm2, xmm15
    addps xmm13, [rsp + nb112_fixH2]
    addps xmm14, [rsp + nb112_fiyH2]
    addps xmm15, [rsp + nb112_fizH2]

    movaps [rsp + nb112_fixO], xmm7
    movaps [rsp + nb112_fiyO], xmm8
    movaps [rsp + nb112_fizO], xmm9
    movaps [rsp + nb112_fixH1], xmm10
    movaps [rsp + nb112_fiyH1], xmm11
    movaps [rsp + nb112_fizH1], xmm12
    movaps [rsp + nb112_fixH2], xmm13
    movaps [rsp + nb112_fiyH2], xmm14
    movaps [rsp + nb112_fizH2], xmm15

    ;# xmm3 = x contributions for a b c d
    ;# xmm4 = y contributions for a b c d
    ;# xmm2 = updated z values for a b c d
    ;# xmm0 / xmm1 still hold the loaded x,y pairs (a,b / c,d)
    movaps xmm5, xmm3
    unpcklps xmm3, xmm4             ;# xa ya xb yb
    unpckhps xmm5, xmm4             ;# xc yc xd yd

    addps xmm0, xmm3
    addps xmm1, xmm5

    movhlps  xmm3, xmm2             ;# fH2zc fH2zd

    ;# write the updated j-H2 values back
    movlps [rdi + rax*4 + 24], xmm0
    movhps [rdi + rbx*4 + 24], xmm0
    movlps [rdi + rcx*4 + 24], xmm1
    movhps [rdi + rdx*4 + 24], xmm1
    movss  [rdi + rax*4 + 32], xmm2
    movss  [rdi + rcx*4 + 32], xmm3
    shufps xmm2, xmm2, 1            ;# element 1 (b) down to element 0
    shufps xmm3, xmm3, 1            ;# element 1 (d) down to element 0
    movss  [rdi + rbx*4 + 32], xmm2
    movss  [rdi + rdx*4 + 32], xmm3

    ;# should we do one more iteration?
    ;# innerk is decremented by 4 per unrolled pass; a negative result means
    ;# fewer than four j waters were left.
    sub dword ptr [rsp + nb112_innerk],  4
    jl    .nb112_single_check
    jmp   .nb112_unroll_loop

.nb112_single_check:
    ;# undo the last subtraction; a non-zero count goes to the single loop
    add dword ptr [rsp + nb112_innerk],  4
    jnz   .nb112_single_loop
    jmp   .nb112_updateouterdata

.nb112_single_loop:
    ;# ---- one j water per pass ----
    mov   rdx, [rsp + nb112_innerjjnr]     ;# pointer to jjnr[k]
    mov   eax, [rdx]                       ;# j index (upper half of rax zeroed)
    add qword ptr [rsp + nb112_innerjjnr],  4  ;# advance to next 4-byte entry

    mov rsi, [rbp + nb112_pos]
    lea   rax, [rax + rax*2]               ;# rax = 3*j; used below as rax*4 byte offset

    ;# fetch j coordinates
    xorps xmm0, xmm0
    xorps xmm1, xmm1
    xorps xmm2, xmm2

    movss xmm0, [rsi + rax*4]              ;# jxO  -  -  -
    movss xmm1, [rsi + rax*4 + 4]          ;# jyO  -  -  -
    movss xmm2, [rsi + rax*4 + 8]          ;# jzO  -  -  -

    movlps xmm6, [rsi + rax*4 + 12]        ;# xmm6 = jxH1 jyH1   -    -
    movss  xmm7, [rsi + rax*4 + 20]        ;# xmm7 = jzH1   -    -    -
    movhps xmm6, [rsi + rax*4 + 24]        ;# xmm6 = jxH1 jyH1 jxH2 jyH2
    movss  xmm5, [rsi + rax*4 + 32]        ;# xmm5 = jzH2   -    -    -

    ;# have all coords, time for some shuffling.
;# ---- single loop: pack the j water as (O, 0, H1, H2) and do i-O vs. j water ----
;# One statement per line: a ';#' comment runs to end of line.
    shufps xmm6, xmm6, 216          ;# 11011000 => xmm6 = jxH1 jxH2 jyH1 jyH2
    unpcklps xmm7, xmm5             ;# xmm7 = jzH1 jzH2   -    -
    movlhps xmm0, xmm6              ;# xmm0 = jxO   0   jxH1 jxH2
    shufps  xmm1, xmm6, 228         ;# 11100100 => xmm1 = jyO   0   jyH1 jyH2
    shufps  xmm2, xmm7, 68          ;# 01000100 => xmm2 = jzO   0   jzH1 jzH2

    ;# store all j coordinates in jO
    movaps [rsp + nb112_jxO], xmm0
    movaps [rsp + nb112_jyO], xmm1
    movaps [rsp + nb112_jzO], xmm2

    ;# displacement d = j - iO, kept on the stack for the force step
    subps  xmm0, [rsp + nb112_ixO]
    subps  xmm1, [rsp + nb112_iyO]
    subps  xmm2, [rsp + nb112_izO]
    movaps [rsp + nb112_dxOO], xmm0
    movaps [rsp + nb112_dyOO], xmm1
    movaps [rsp + nb112_dzOO], xmm2
    mulps xmm0, xmm0
    mulps xmm1, xmm1
    mulps xmm2, xmm2
    addps xmm0, xmm1
    addps xmm0, xmm2                ;# have rsq in xmm0

    ;# do invsqrt: rinv = half * lu * (three - rsq*lu*lu), lu = rsqrtps(rsq)
    rsqrtps xmm1, xmm0
    movaps  xmm2, xmm1
    mulps   xmm1, xmm1
    movaps  xmm3, [rsp + nb112_three]
    mulps   xmm1, xmm0
    subps   xmm3, xmm1
    mulps   xmm3, xmm2
    mulps   xmm3, [rsp + nb112_half] ;# rinv iO - j water

    xorps   xmm1, xmm1
    movaps  xmm0, xmm3
    xorps   xmm4, xmm4
    mulps   xmm0, xmm0              ;# xmm0=rinvsq

    ;# fetch charges to xmm4 (temporary):
    ;# element 0 from qqOO, element 1 stays zero, elements 2-3 from qqOH
    movss   xmm4, [rsp + nb112_qqOO]
    movss   xmm1, xmm0              ;# xmm1 = rinvsq(0) 0 0 0
    movhps  xmm4, [rsp + nb112_qqOH]

    ;# the scalar (ss) ops below touch element 0 only, i.e. the O-O pair
    mulss   xmm1, xmm0
    mulps   xmm3, xmm4              ;# xmm3=vcoul
    mulss   xmm1, xmm0              ;# xmm1(0)=rinvsix
    movaps  xmm2, xmm1              ;# elements 1-3 of xmm2 are zero, like xmm1
    mulss   xmm2, xmm2              ;# xmm2=rinvtwelve

    mulss   xmm1, [rsp + nb112_c6]
    mulss   xmm2, [rsp + nb112_c12]
    movaps  xmm4, xmm2
    subss   xmm4, xmm1              ;# Vvdwtot=Vvdw12-Vvdw6
    addps   xmm4, [rsp + nb112_Vvdwtot]
    mulss   xmm1, [rsp + nb112_six]
    mulss   xmm2, [rsp + nb112_twelve]
    movaps  [rsp + nb112_Vvdwtot], xmm4
    subss   xmm2, xmm1              ;# fsD+ fsR
    addps   xmm2, xmm3              ;# fsC+ fsD+ fsR

    addps   xmm3, [rsp + nb112_vctot]
    mulps   xmm0, xmm2              ;# total fscal
    movaps  [rsp + nb112_vctot], xmm3

    ;# force components = fscal * d
    movaps  xmm1, xmm0
    movaps  xmm2, xmm0
    mulps   xmm0, [rsp + nb112_dxOO]
    mulps   xmm1, [rsp + nb112_dyOO]
    mulps   xmm2, [rsp + nb112_dzOO]

    ;# initial update for j forces
    xorps   xmm3, xmm3
    xorps   xmm4, xmm4
;# ---- single loop: store i-O results, then i-H1 / i-H2 vs. j water ----
;# One statement per line: a ';#' comment runs to end of line.
    xorps   xmm5, xmm5
    addps   xmm3, xmm0
    addps   xmm4, xmm1
    addps   xmm5, xmm2
    movaps  [rsp + nb112_fjxO], xmm3    ;# fj accumulators start from the i-O terms
    movaps  [rsp + nb112_fjyO], xmm4
    movaps  [rsp + nb112_fjzO], xmm5
    addps   xmm0, [rsp + nb112_fixO]
    addps   xmm1, [rsp + nb112_fiyO]
    addps   xmm2, [rsp + nb112_fizO]
    movaps  [rsp + nb112_fixO], xmm0
    movaps  [rsp + nb112_fiyO], xmm1
    movaps  [rsp + nb112_fizO], xmm2

    ;# done with i O. Now do i H1 & H2 simultaneously.
    ;# Reload the packed j coordinates and form d = j - iH1 and d = j - iH2.
    movaps  xmm0, [rsp + nb112_jxO]
    movaps  xmm1, [rsp + nb112_jyO]
    movaps  xmm2, [rsp + nb112_jzO]
    movaps  xmm3, xmm0
    movaps  xmm4, xmm1
    movaps  xmm5, xmm2
    subps  xmm0, [rsp + nb112_ixH1]
    subps  xmm1, [rsp + nb112_iyH1]
    subps  xmm2, [rsp + nb112_izH1]
    subps  xmm3, [rsp + nb112_ixH2]
    subps  xmm4, [rsp + nb112_iyH2]
    subps  xmm5, [rsp + nb112_izH2]
    movaps [rsp + nb112_dxH1O], xmm0
    movaps [rsp + nb112_dyH1O], xmm1
    movaps [rsp + nb112_dzH1O], xmm2
    movaps [rsp + nb112_dxH2O], xmm3
    movaps [rsp + nb112_dyH2O], xmm4
    movaps [rsp + nb112_dzH2O], xmm5
    mulps xmm0, xmm0
    mulps xmm1, xmm1
    mulps xmm2, xmm2
    mulps xmm3, xmm3
    mulps xmm4, xmm4
    mulps xmm5, xmm5
    addps xmm0, xmm1
    addps xmm4, xmm3
    addps xmm0, xmm2                ;# have rsqH1 in xmm0
    addps xmm4, xmm5                ;# have rsqH2 in xmm4

    ;# do invsqrt: rinv = half * lu * (three - rsq*lu*lu), lu = rsqrtps(rsq)
    rsqrtps xmm1, xmm0
    rsqrtps xmm5, xmm4
    movaps  xmm2, xmm1              ;# keep lu (H1)
    movaps  xmm6, xmm5              ;# keep lu (H2)
    mulps   xmm1, xmm1
    mulps   xmm5, xmm5
    movaps  xmm3, [rsp + nb112_three]
    movaps  xmm7, xmm3
    mulps   xmm1, xmm0
    mulps   xmm5, xmm4
    subps   xmm3, xmm1
    subps   xmm7, xmm5
    mulps   xmm3, xmm2
    mulps   xmm7, xmm6
    mulps   xmm3, [rsp + nb112_half] ;# rinv H1 - j water
    mulps   xmm7, [rsp + nb112_half] ;# rinv H2 - j water

    ;# assemble charges in xmm6:
    ;# element 0 from qqOH, element 1 stays zero, elements 2-3 from qqHH
    xorps   xmm6, xmm6
    ;# do coulomb interaction
    movaps  xmm0, xmm3
    movss   xmm6, [rsp + nb112_qqOH]
    movaps  xmm4, xmm7
    movhps  xmm6, [rsp + nb112_qqHH]
    mulps   xmm0, xmm0              ;# rinvsq
    mulps   xmm4, xmm4              ;# rinvsq
    mulps   xmm3, xmm6              ;# vcoul
    mulps   xmm7, xmm6              ;# vcoul
    movaps  xmm2, xmm3
    addps   xmm2, xmm7              ;# total vcoul
    mulps   xmm0, xmm3              ;# fscal
    addps   xmm2, [rsp + nb112_vctot]
    mulps   xmm7, xmm4              ;# fscal
    movaps  [rsp + nb112_vctot], xmm2

    ;# H1 force components = fscal * d
    movaps  xmm1, xmm0
    movaps  xmm2, xmm0
    mulps   xmm0, [rsp + nb112_dxH1O]
    mulps   xmm1, [rsp + nb112_dyH1O]
    mulps   xmm2, [rsp + nb112_dzH1O]

    ;# update forces H1 - j water
    movaps  xmm3, [rsp + nb112_fjxO]
    movaps  xmm4, [rsp + nb112_fjyO]
    movaps  xmm5, [rsp + nb112_fjzO]
    addps   xmm3, xmm0
    addps   xmm4, xmm1
    addps   xmm5, xmm2
    movaps  [rsp + nb112_fjxO], xmm3
    movaps  [rsp + nb112_fjyO], xmm4
    movaps  [rsp + nb112_fjzO], xmm5
    addps   xmm0, [rsp + nb112_fixH1]
    addps   xmm1, [rsp + nb112_fiyH1]
    addps   xmm2, [rsp + nb112_fizH1]
    movaps  [rsp + nb112_fixH1], xmm0
    movaps  [rsp + nb112_fiyH1], xmm1
    movaps  [rsp + nb112_fizH1], xmm2

    ;# do forces H2 - j water
    movaps xmm0, xmm7
    movaps xmm1, xmm7
    movaps xmm2, xmm7
    mulps   xmm0, [rsp + nb112_dxH2O]
    mulps   xmm1, [rsp + nb112_dyH2O]
    mulps   xmm2, [rsp + nb112_dzH2O]
    movaps  xmm3, [rsp + nb112_fjxO]
    movaps  xmm4, [rsp + nb112_fjyO]
    movaps  xmm5, [rsp + nb112_fjzO]
    addps   xmm3, xmm0
    addps   xmm4, xmm1
    addps   xmm5, xmm2
    mov     rsi, [rbp + nb112_faction]  ;# rsi now addresses the force array
    movaps  [rsp + nb112_fjxO], xmm3
    movaps  [rsp + nb112_fjyO], xmm4
    movaps  [rsp + nb112_fjzO], xmm5
    addps   xmm0, [rsp + nb112_fixH2]
    addps   xmm1, [rsp + nb112_fiyH2]
    addps   xmm2, [rsp + nb112_fizH2]
    movaps  [rsp + nb112_fixH2], xmm0
    movaps  [rsp + nb112_fiyH2], xmm1
    movaps  [rsp + nb112_fizH2], xmm2

    ;# update j water forces from local variables
    movlps  xmm0, [rsi + rax*4]         ;# j O  x,y
    movlps  xmm1, [rsi + rax*4 + 12]    ;# j H1 x,y (low half)
    movhps  xmm1, [rsi + rax*4 + 24]    ;# j H2 x,y (high half)
    movaps  xmm3, [rsp + nb112_fjxO]
    movaps  xmm4, [rsp + nb112_fjyO]
    movaps  xmm5, [rsp + nb112_fjzO]

    ;# z components: element 0 = O, element 2 = H1, element 3 = H2
    movaps  xmm6, xmm5
    movaps  xmm7, xmm5
    shufps  xmm6, xmm6, 2           ;# 00000010 => element 2 into element 0
    shufps  xmm7, xmm7, 3           ;# 00000011 => element 3 into element 0
    addss   xmm5, [rsi + rax*4 + 8]
    addss   xmm6, [rsi + rax*4 + 20]
    addss   xmm7, [rsi + rax*4 + 32]
    movss   [rsi + rax*4 + 8], xmm5
    movss   [rsi + rax*4 + 20], xmm6
    movss   [rsi + rax*4 + 32], xmm7
;# ---- single loop: write back j x,y forces, loop control, start of outer update ----
;# One statement per line and labels on their own lines: a ';#' comment runs
;# to end of line, and a label glued to a jump operand is not a label.
    movaps   xmm5, xmm3
    unpcklps xmm3, xmm4             ;# elements 0-1 of fjx/fjy interleaved (O x,y first)
    unpckhps xmm5, xmm4             ;# H1x H1y H2x H2y
    addps    xmm0, xmm3
    addps    xmm1, xmm5
    movlps  [rsi + rax*4], xmm0         ;# j O  x,y
    movlps  [rsi + rax*4 + 12], xmm1    ;# j H1 x,y
    movhps  [rsi + rax*4 + 24], xmm1    ;# j H2 x,y

    ;# one j water done; repeat until innerk reaches zero
    dec dword ptr [rsp + nb112_innerk]
    jz    .nb112_updateouterdata
    jmp   .nb112_single_loop

.nb112_updateouterdata:
    ;# The remainder of this section lies beyond the visible excerpt.
    mov   ecx, [rsp + nb112_ii3]
    mov   rdi, [rbp + nb112_faction]
    mov   rsi, [rbp + nb112_fshift]
    mov   edx, [rsp + nb112_is3]

    ;# accumulate  Oi forces in xmm0, xmm1, xmm2
    movaps xmm0, [rsp + nb112_fixO]
    movaps xmm1, [rsp + nb112_fiyO]
    movaps xmm2, [rsp + nb112_fizO]

    movhlps xmm3, xmm0              ;# upper two elements of xmm0 into low half of xmm3
    movhlps xmm4, xmm1              ;# upper two elements of xmm1 into low half of xmm4

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?