nb_kernel234_x86_64_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,365 行 · 第 1/5 页

S
2,365
字号
        ;# ---- O-O pair: rsq -> rinv -> table index and eps ----
        ;# xmm4, xmm5, xmm6 arrive holding the three values that are squared
        ;# and summed below (presumably the O-O dx, dy, dz computed just before
        ;# this point -- confirm against the preceding code).
        ;# square it
        mulpd     xmm4, xmm4
        mulpd     xmm5, xmm5
        mulpd     xmm6, xmm6
        addpd     xmm4, xmm5
        addpd     xmm4, xmm6
        ;# rsq in xmm4

        ;# Seed: single-precision rsqrt estimate, widened back to double.
        cvtpd2ps  xmm5, xmm4
        rsqrtps   xmm5, xmm5
        cvtps2pd  xmm2, xmm5                  ;# lu (lookup seed) in xmm2

        ;# Refinement step 1: lu' = half * lu * (three - rsq*lu*lu)
        movapd    xmm5, xmm2                  ;# copy of lu
        mulpd     xmm2, xmm2                  ;# lu*lu
        movapd    xmm1, [rsp + nb234_three]
        mulpd     xmm2, xmm4                  ;# rsq*lu*lu
        movapd    xmm0, [rsp + nb234_half]
        subpd     xmm1, xmm2                  ;# 3-rsq*lu*lu
        mulpd     xmm1, xmm5                  ;# lu*(3-rsq*lu*lu)
        mulpd     xmm1, xmm0                  ;# xmm1 = iter1 of rinv (new lu)

        ;# Refinement step 2: same update, result lands in xmm2.
        movapd    xmm5, xmm1                  ;# copy of lu
        mulpd     xmm1, xmm1                  ;# lu*lu
        movapd    xmm2, [rsp + nb234_three]
        mulpd     xmm1, xmm4                  ;# rsq*lu*lu
        movapd    xmm0, [rsp + nb234_half]
        subpd     xmm2, xmm1                  ;# 3-rsq*lu*lu
        mulpd     xmm2, xmm5                  ;# lu*(3-rsq*lu*lu)
        mulpd     xmm2, xmm0                  ;# xmm2 = iter2 of rinv (new lu)

        ;# Table position: r*tsc split into integer index and fraction eps.
        mulpd     xmm4, xmm2                  ;# xmm4 = rsq*rinv = r
        mulpd     xmm4, [rsp + nb234_tsc]     ;# r*tsc (presumably the table scale)
        cvttpd2pi mm6, xmm4                   ;# mm6 = lu idx (truncated, two int32)
        cvtpi2pd  xmm5, mm6
        subpd     xmm4, xmm5                  ;# r*tsc - idx
        movapd    xmm1, xmm4                  ;# xmm1 = eps
        ;# xmm2 = rinv
        ;# NOTE(review): the xmm3 copy below looks dead -- the table loads that
        ;# follow overwrite xmm3 before anything reads it.
        movapd    xmm3, xmm4                  ;# eps
        pslld     mm6, 3                      ;# idx *= 8
        mov       rsi, [rbp + nb234_VFtab]
        movd      r10d, mm6                   ;# index for the low lane
        psrlq     mm6, 32                     ;# bring the high dword down
        movd      r11d, mm6                   ;# index for the high lane
        ;# NOTE(review): MMX state is touched here; an emms is needed before any
        ;# x87 use -- not visible in this part of the file.
        ;# indices in r10, r11. Load dispersion and repulsion tables in parallel.
;# ---- O-O pair: table interpolation, energy and force update ----
;# Each table point is read as four 16-byte pairs, per the offsets used below:
;#   +0  Y,F dispersion   +16 G,H dispersion
;#   +32 Y,F repulsion    +48 G,H repulsion
;# r10 indexes the low lane, r11 the high lane; unpck{l,h}pd transposes the
;# two loads into per-coefficient vectors.
        movapd    xmm4, [rsi + r10*8]         ;# Y1d F1d
        movapd    xmm0, [rsi + r11*8]         ;# Y2d F2d
        movapd    xmm8, [rsi + r10*8 + 32]    ;# Y1r F1r
        movapd    xmm3, [rsi + r11*8 + 32]    ;# Y2r F2r
        movapd    xmm5, xmm4
        movapd    xmm9, xmm8
        unpcklpd  xmm4, xmm0                  ;# Y1d Y2d
        unpckhpd  xmm5, xmm0                  ;# F1d F2d
        unpcklpd  xmm8, xmm3                  ;# Y1r Y2r
        unpckhpd  xmm9, xmm3                  ;# F1r F2r

        movapd    xmm6, [rsi + r10*8 + 16]    ;# G1d H1d
        movapd    xmm0, [rsi + r11*8 + 16]    ;# G2d H2d
        movapd    xmm10, [rsi + r10*8 + 48]   ;# G1r H1r
        movapd    xmm3, [rsi + r11*8 + 48]    ;# G2r H2r
        movapd    xmm7, xmm6
        movapd    xmm11, xmm10
        unpcklpd  xmm6, xmm0                  ;# G1d G2d
        unpckhpd  xmm7, xmm0                  ;# H1d H2d
        unpcklpd  xmm10, xmm3                 ;# G1r G2r
        unpckhpd  xmm11, xmm3                 ;# H1r H2r
        ;# tables ready, in xmm4-xmm7 (dispersion) and xmm8-xmm11 (repulsion)

        ;# Cubic interpolation, dispersion and repulsion interleaved (xmm1 = eps):
        ;#   Fp = F + G*eps + H*eps^2
        ;#   VV = Y + eps*Fp
        ;#   FF = Fp + G*eps + 2*H*eps^2
        mulpd     xmm7, xmm1                  ;# Heps
        mulpd     xmm11, xmm1
        mulpd     xmm6, xmm1                  ;# Geps
        mulpd     xmm10, xmm1
        mulpd     xmm7, xmm1                  ;# Heps2
        mulpd     xmm11, xmm1
        addpd     xmm5, xmm6                  ;# F+Geps
        addpd     xmm9, xmm10
        addpd     xmm5, xmm7                  ;# F+Geps+Heps2 = Fp
        addpd     xmm9, xmm11
        addpd     xmm7, xmm7                  ;# 2*Heps2
        addpd     xmm11, xmm11
        addpd     xmm7, xmm6                  ;# 2*Heps2+Geps
        addpd     xmm11, xmm10
        addpd     xmm7, xmm5                  ;# FF = Fp + 2*Heps2 + Geps
        addpd     xmm11, xmm9
        mulpd     xmm5, xmm1                  ;# eps*Fp
        mulpd     xmm9, xmm1
        addpd     xmm5, xmm4                  ;# VV
        addpd     xmm9, xmm8

        ;# Energy: Vvdwtot += VV_d*c6 + VV_r*c12
        mulpd     xmm5, [rsp + nb234_c6]      ;# VV*c6 = vnb6
        mulpd     xmm9, [rsp + nb234_c12]     ;# VV*c12 = vnb12
        addpd     xmm5, xmm9
        addpd     xmm5, [rsp + nb234_Vvdwtot]
        movapd    [rsp + nb234_Vvdwtot], xmm5

        ;# Scalar force: xmm9 = -(FF_d*c6 + FF_r*c12) * tsc * rinv  (xmm2 = rinv)
        mulpd     xmm7, [rsp + nb234_c6]      ;# FF*c6 = fnb6
        mulpd     xmm11, [rsp + nb234_c12]    ;# FF*c12 = fnb12
        addpd     xmm7, xmm11
        mulpd     xmm7, [rsp + nb234_tsc]
        mulpd     xmm7, xmm2
        xorpd     xmm9, xmm9
        subpd     xmm9, xmm7                  ;# negate

        ;# xmm13-xmm15 are scaled in place; they are expected to hold the O-O
        ;# distance components set up before this chunk -- TODO confirm.
        mulpd     xmm13, xmm9
        mulpd     xmm14, xmm9
        mulpd     xmm15, xmm9

        movapd    xmm0, [rsp + nb234_fixO]
        movapd    xmm1, [rsp + nb234_fiyO]
        movapd    xmm2, [rsp + nb234_fizO]
        ;# accumulate i forces
        addpd     xmm0, xmm13
        addpd     xmm1, xmm14
        addpd     xmm2, xmm15
        movapd    [rsp + nb234_fixO], xmm0
        movapd    [rsp + nb234_fiyO], xmm1
        movapd    [rsp + nb234_fizO], xmm2

        ;# the fj's - start by accumulating forces from memory
        ;# rax addresses the low-lane j entry, rbx the high-lane one.
        ;# NOTE(review): rdi is used as loaded earlier (presumably the force
        ;# array; it is reloaded from nb234_faction later in the file).
        movlpd    xmm3, [rdi + rax*8]
        movlpd    xmm4, [rdi + rax*8 + 8]
        movlpd    xmm5, [rdi + rax*8 + 16]
        movhpd    xmm3, [rdi + rbx*8]
        movhpd    xmm4, [rdi + rbx*8 + 8]
        movhpd    xmm5, [rdi + rbx*8 + 16]
        addpd     xmm3, xmm13
        addpd     xmm4, xmm14
        addpd     xmm5, xmm15
        movlpd    [rdi + rax*8], xmm3
        movlpd    [rdi + rax*8 + 8], xmm4
        movlpd    [rdi + rax*8 + 16], xmm5
        movhpd    [rdi + rbx*8], xmm3
        movhpd    [rdi + rbx*8 + 8], xmm4
        movhpd    [rdi + rbx*8 + 16], xmm5
        ;# done with OO interaction

        ;# ---- j-H1 against i-H1, i-H2, i-M: distance vectors and rsq ----
        ;# move j H1 coordinates to local temp variables
        mov       rsi, [rbp + nb234_pos]
        movlpd    xmm0, [rsi + rax*8 + 24]
        movlpd    xmm1, [rsi + rax*8 + 32]
        movlpd    xmm2, [rsi + rax*8 + 40]
        movhpd    xmm0, [rsi + rbx*8 + 24]
        movhpd    xmm1, [rsi + rbx*8 + 32]
        movhpd    xmm2, [rsi + rbx*8 + 40]
        ;# xmm0 = H1x
        ;# xmm1 = H1y
        ;# xmm2 = H1z

        ;# Three copies of the j-H1 position, one per i site.
        movapd    xmm3, xmm0
        movapd    xmm4, xmm1
        movapd    xmm5, xmm2
        movapd    xmm6, xmm0
        movapd    xmm7, xmm1
        movapd    xmm8, xmm2

        ;# d = j - i for i = H1 (xmm0-2), H2 (xmm3-5), M (xmm6-8)
        subpd     xmm0, [rsp + nb234_ixH1]
        subpd     xmm1, [rsp + nb234_iyH1]
        subpd     xmm2, [rsp + nb234_izH1]
        subpd     xmm3, [rsp + nb234_ixH2]
        subpd     xmm4, [rsp + nb234_iyH2]
        subpd     xmm5, [rsp + nb234_izH2]
        subpd     xmm6, [rsp + nb234_ixM]
        subpd     xmm7, [rsp + nb234_iyM]
        subpd     xmm8, [rsp + nb234_izM]

        ;# Save each distance vector for the force step, then square in place.
        movapd    [rsp + nb234_dxH1H1], xmm0
        movapd    [rsp + nb234_dyH1H1], xmm1
        movapd    [rsp + nb234_dzH1H1], xmm2
        mulpd     xmm0, xmm0
        mulpd     xmm1, xmm1
        mulpd     xmm2, xmm2
        movapd    [rsp + nb234_dxH2H1], xmm3
        movapd    [rsp + nb234_dyH2H1], xmm4
        movapd    [rsp + nb234_dzH2H1], xmm5
        mulpd     xmm3, xmm3
        mulpd     xmm4, xmm4
        mulpd     xmm5, xmm5
        movapd    [rsp + nb234_dxMH1], xmm6
        movapd    [rsp + nb234_dyMH1], xmm7
        movapd    [rsp + nb234_dzMH1], xmm8
        mulpd     xmm6, xmm6
        mulpd     xmm7, xmm7
        mulpd     xmm8, xmm8
        addpd     xmm0, xmm1
        addpd     xmm0, xmm2                  ;# rsq H1-H1 in xmm0
        addpd     xmm3, xmm4
        addpd     xmm3, xmm5                  ;# rsq H2-H1 in xmm3
;# ---- j-H1: finish rsq, invsqrt, reaction-field energy and forces ----
        addpd     xmm6, xmm7
        addpd     xmm6, xmm8                  ;# rsq M-H1 in xmm6

        ;# start doing invsqrt for jH1 atoms
        ;# rsq stays in xmm0 / xmm3 / xmm6; seeds come from the single-precision
        ;# rsqrt estimate, widened back to double.
        cvtpd2ps  xmm1, xmm0
        cvtpd2ps  xmm4, xmm3
        cvtpd2ps  xmm7, xmm6
        rsqrtps   xmm1, xmm1
        rsqrtps   xmm4, xmm4
        rsqrtps   xmm7, xmm7
        cvtps2pd  xmm1, xmm1
        cvtps2pd  xmm4, xmm4
        cvtps2pd  xmm7, xmm7

        ;# first iteration step: lu' = half * lu * (three - rsq*lu*lu)
        movapd    xmm2, xmm1                  ;# keep lu
        movapd    xmm5, xmm4
        movapd    xmm8, xmm7
        mulpd     xmm1, xmm1                  ;# lu*lu
        mulpd     xmm4, xmm4                  ;# lu*lu
        mulpd     xmm7, xmm7                  ;# lu*lu
        movapd    xmm9, [rsp + nb234_three]
        movapd    xmm10, xmm9
        movapd    xmm11, xmm9
        mulpd     xmm1, xmm0                  ;# rsq*lu*lu
        mulpd     xmm4, xmm3                  ;# rsq*lu*lu
        mulpd     xmm7, xmm6                  ;# rsq*lu*lu
        subpd     xmm9, xmm1
        subpd     xmm10, xmm4
        subpd     xmm11, xmm7                 ;# 3-rsq*lu*lu
        mulpd     xmm9, xmm2
        mulpd     xmm10, xmm5
        mulpd     xmm11, xmm8                 ;# lu*(3-rsq*lu*lu)
        movapd    xmm15, [rsp + nb234_half]
        mulpd     xmm9, xmm15                 ;# first iteration for rinvH1H1
        mulpd     xmm10, xmm15                ;# first iteration for rinvH2H1
        mulpd     xmm11, xmm15                ;# first iteration for rinvMH1

        ;# second iteration step
        movapd    xmm2, xmm9
        movapd    xmm5, xmm10
        movapd    xmm8, xmm11
        mulpd     xmm2, xmm2                  ;# lu*lu
        mulpd     xmm5, xmm5                  ;# lu*lu
        mulpd     xmm8, xmm8                  ;# lu*lu
        movapd    xmm1, [rsp + nb234_three]
        movapd    xmm4, xmm1
        movapd    xmm7, xmm1
        mulpd     xmm2, xmm0                  ;# rsq*lu*lu
        mulpd     xmm5, xmm3                  ;# rsq*lu*lu
        mulpd     xmm8, xmm6                  ;# rsq*lu*lu
        subpd     xmm1, xmm2
        subpd     xmm4, xmm5
        subpd     xmm7, xmm8                  ;# 3-rsq*lu*lu
        mulpd     xmm9, xmm1
        mulpd     xmm10, xmm4
        mulpd     xmm11, xmm7                 ;# lu*(3-rsq*lu*lu)
        movapd    xmm15, [rsp + nb234_half]
        mulpd     xmm9, xmm15                 ;# rinvH1H1
        mulpd     xmm10, xmm15                ;# rinvH2H1
        mulpd     xmm11, xmm15                ;# rinvMH1

        ;# H1 interactions
        ;# rsq in xmm0,xmm3,xmm6
        ;# rinv in xmm9, xmm10, xmm11
        ;# Per pair, with k = nb234_krf and c = nb234_crf:
        ;#   vcoul = qq * (rinv + k*rsq - c)
        ;#   fscal = qq * (rinv - 2*k*rsq) * rinv^2
        movapd    xmm1, xmm9                  ;# copy of rinv
        movapd    xmm4, xmm10
        movapd    xmm7, xmm11
        movapd    xmm2, [rsp + nb234_krf]
        mulpd     xmm9, xmm9                  ;# rinvsq
        mulpd     xmm10, xmm10
        mulpd     xmm11, xmm11
        mulpd     xmm0, xmm2                  ;# k*rsq
        mulpd     xmm3, xmm2
        mulpd     xmm6, xmm2
        movapd    xmm2, xmm0                  ;# copy of k*rsq
        movapd    xmm5, xmm3
        movapd    xmm8, xmm6
        addpd     xmm2, xmm1                  ;# rinv+krsq
        addpd     xmm5, xmm4
        addpd     xmm8, xmm7
        movapd    xmm14, [rsp + nb234_crf]
        subpd     xmm2, xmm14                 ;# rinv+krsq-crf
        subpd     xmm5, xmm14
        subpd     xmm8, xmm14
        movapd    xmm12, [rsp + nb234_qqHH]   ;# used for the H1-H1 and H2-H1 pairs
        movapd    xmm13, [rsp + nb234_qqMH]   ;# used for the M-H1 pair
        mulpd     xmm2, xmm12                 ;# vcoul=qq*(rinv+krsq-crf)
        mulpd     xmm5, xmm12                 ;# vcoul=qq*(rinv+krsq-crf)
        mulpd     xmm8, xmm13                 ;# vcoul=qq*(rinv+krsq-crf)
        addpd     xmm0, xmm0                  ;# 2*krsq
        addpd     xmm3, xmm3
        addpd     xmm6, xmm6
        subpd     xmm1, xmm0                  ;# rinv-2*krsq
        subpd     xmm4, xmm3
        subpd     xmm7, xmm6
        mulpd     xmm1, xmm12                 ;# (rinv-2*krsq)*qq
        mulpd     xmm4, xmm12
        mulpd     xmm7, xmm13

        ;# vctot += vcoul(H1-H1) + vcoul(H2-H1) + vcoul(M-H1)
        addpd     xmm2, [rsp + nb234_vctot]
        addpd     xmm5, xmm8
        addpd     xmm2, xmm5
        movapd    [rsp + nb234_vctot], xmm2

        mulpd     xmm9, xmm1                  ;# fscal H1-H1
        mulpd     xmm10, xmm4                 ;# fscal H2-H1
        mulpd     xmm11, xmm7                 ;# fscal M-H1

        ;# move j H1 forces to xmm0-xmm2
        mov       rdi, [rbp + nb234_faction]
        movlpd    xmm0, [rdi + rax*8 + 24]
        movlpd    xmm1, [rdi + rax*8 + 32]
        movlpd    xmm2, [rdi + rax*8 + 40]
        movhpd    xmm0, [rdi + rbx*8 + 24]
        movhpd    xmm1, [rdi + rbx*8 + 32]
        movhpd    xmm2, [rdi + rbx*8 + 40]

        ;# Fan the three fscal values out to x/y/z slots:
        ;#   xmm7-9 = H1-H1, xmm10-12 = H2-H1, xmm13-15 = M-H1.
        ;# Order matters: xmm11 is copied out before it is overwritten.
        movapd    xmm7, xmm9
        movapd    xmm8, xmm9
        movapd    xmm13, xmm11
        movapd    xmm14, xmm11
        movapd    xmm15, xmm11
        movapd    xmm11, xmm10
        movapd    xmm12, xmm10

        ;# Force vectors = fscal * saved distance components.
        mulpd     xmm7, [rsp + nb234_dxH1H1]
        mulpd     xmm8, [rsp + nb234_dyH1H1]
        mulpd     xmm9, [rsp + nb234_dzH1H1]
        mulpd     xmm10, [rsp + nb234_dxH2H1]
        mulpd     xmm11, [rsp + nb234_dyH2H1]
        mulpd     xmm12, [rsp + nb234_dzH2H1]
        mulpd     xmm13, [rsp + nb234_dxMH1]
        mulpd     xmm14, [rsp + nb234_dyMH1]
        mulpd     xmm15, [rsp + nb234_dzMH1]

        ;# Each pair force is added to the j-H1 sum (xmm0-2) and then to the
        ;# matching i-site accumulator loaded from the stack.
        addpd     xmm0, xmm7
        addpd     xmm1, xmm8
        addpd     xmm2, xmm9
        addpd     xmm7, [rsp + nb234_fixH1]
        addpd     xmm8, [rsp + nb234_fiyH1]
        addpd     xmm9, [rsp + nb234_fizH1]
        addpd     xmm0, xmm10
        addpd     xmm1, xmm11
        addpd     xmm2, xmm12
        addpd     xmm10, [rsp + nb234_fixH2]
        addpd     xmm11, [rsp + nb234_fiyH2]
        addpd     xmm12, [rsp + nb234_fizH2]
        addpd     xmm0, xmm13
;# ---- j-H1: finish force sums and store; j-H2: distances and invsqrt seed ----
        addpd     xmm1, xmm14
        addpd     xmm2, xmm15
        addpd     xmm13, [rsp + nb234_fixM]
        addpd     xmm14, [rsp + nb234_fiyM]
        addpd     xmm15, [rsp + nb234_fizM]

        ;# Write the updated i-site force accumulators back to the stack.
        movapd    [rsp + nb234_fixH1], xmm7
        movapd    [rsp + nb234_fiyH1], xmm8
        movapd    [rsp + nb234_fizH1], xmm9
        movapd    [rsp + nb234_fixH2], xmm10
        movapd    [rsp + nb234_fiyH2], xmm11
        movapd    [rsp + nb234_fizH2], xmm12
        movapd    [rsp + nb234_fixM], xmm13
        movapd    [rsp + nb234_fiyM], xmm14
        movapd    [rsp + nb234_fizM], xmm15

        ;# store back j H1 forces from xmm0-xmm2
        ;# (low lane -> entry addressed by rax, high lane -> entry addressed by rbx)
        movlpd    [rdi + rax*8 + 24], xmm0
        movlpd    [rdi + rax*8 + 32], xmm1
        movlpd    [rdi + rax*8 + 40], xmm2
        movhpd    [rdi + rbx*8 + 24], xmm0
        movhpd    [rdi + rbx*8 + 32], xmm1
        movhpd    [rdi + rbx*8 + 40], xmm2

        ;# move j H2 coordinates to local temp variables
        mov       rsi, [rbp + nb234_pos]
        movlpd    xmm0, [rsi + rax*8 + 48]
        movlpd    xmm1, [rsi + rax*8 + 56]
        movlpd    xmm2, [rsi + rax*8 + 64]
        movhpd    xmm0, [rsi + rbx*8 + 48]
        movhpd    xmm1, [rsi + rbx*8 + 56]
        movhpd    xmm2, [rsi + rbx*8 + 64]
        ;# xmm0 = H2x
        ;# xmm1 = H2y
        ;# xmm2 = H2z

        ;# Three copies of the j-H2 position, one per i site.
        movapd    xmm3, xmm0
        movapd    xmm4, xmm1
        movapd    xmm5, xmm2
        movapd    xmm6, xmm0
        movapd    xmm7, xmm1
        movapd    xmm8, xmm2

        ;# d = j - i for i = H1 (xmm0-2), H2 (xmm3-5), M (xmm6-8)
        subpd     xmm0, [rsp + nb234_ixH1]
        subpd     xmm1, [rsp + nb234_iyH1]
        subpd     xmm2, [rsp + nb234_izH1]
        subpd     xmm3, [rsp + nb234_ixH2]
        subpd     xmm4, [rsp + nb234_iyH2]
        subpd     xmm5, [rsp + nb234_izH2]
        subpd     xmm6, [rsp + nb234_ixM]
        subpd     xmm7, [rsp + nb234_iyM]
        subpd     xmm8, [rsp + nb234_izM]

        ;# Save each distance vector, then square in place.
        movapd    [rsp + nb234_dxH1H2], xmm0
        movapd    [rsp + nb234_dyH1H2], xmm1
        movapd    [rsp + nb234_dzH1H2], xmm2
        mulpd     xmm0, xmm0
        mulpd     xmm1, xmm1
        mulpd     xmm2, xmm2
        movapd    [rsp + nb234_dxH2H2], xmm3
        movapd    [rsp + nb234_dyH2H2], xmm4
        movapd    [rsp + nb234_dzH2H2], xmm5
        mulpd     xmm3, xmm3
        mulpd     xmm4, xmm4
        mulpd     xmm5, xmm5
        movapd    [rsp + nb234_dxMH2], xmm6
        movapd    [rsp + nb234_dyMH2], xmm7
        movapd    [rsp + nb234_dzMH2], xmm8
        mulpd     xmm6, xmm6
        mulpd     xmm7, xmm7
        mulpd     xmm8, xmm8
        addpd     xmm0, xmm1
        addpd     xmm0, xmm2                  ;# rsq H1-H2 in xmm0
        addpd     xmm3, xmm4
        addpd     xmm3, xmm5                  ;# rsq H2-H2 in xmm3
        addpd     xmm6, xmm7
        addpd     xmm6, xmm8                  ;# rsq M-H2 in xmm6

        ;# start doing invsqrt for jH2 atoms
        cvtpd2ps  xmm1, xmm0
        cvtpd2ps  xmm4, xmm3
        cvtpd2ps  xmm7, xmm6
        rsqrtps   xmm1, xmm1
        rsqrtps   xmm4, xmm4
        rsqrtps   xmm7, xmm7
        cvtps2pd  xmm1, xmm1
        cvtps2pd  xmm4, xmm4
        cvtps2pd  xmm7, xmm7

        ;# First refinement step, up to lu*(three - rsq*lu*lu).
        movapd    xmm2, xmm1                  ;# keep lu
        movapd    xmm5, xmm4
        movapd    xmm8, xmm7
        mulpd     xmm1, xmm1                  ;# lu*lu
        mulpd     xmm4, xmm4                  ;# lu*lu
        mulpd     xmm7, xmm7                  ;# lu*lu
        movapd    xmm9, [rsp + nb234_three]
        movapd    xmm10, xmm9
        movapd    xmm11, xmm9
        mulpd     xmm1, xmm0                  ;# rsq*lu*lu
        mulpd     xmm4, xmm3                  ;# rsq*lu*lu
        mulpd     xmm7, xmm6                  ;# rsq*lu*lu
        subpd     xmm9, xmm1
        subpd     xmm10, xmm4
        subpd     xmm11, xmm7                 ;# 3-rsq*lu*lu
        mulpd     xmm9, xmm2
        mulpd     xmm10, xmm5
        mulpd     xmm11, xmm8                 ;# lu*(3-rsq*lu*lu)

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?