nb_kernel332_ia32_sse2.intel_syntax.s

来自「最著名最快的分子模拟软件」· S 代码 · 共 2,183 行 · 第 1/5 页

S
2,183
字号
	movapd xmm0, [esp + nb332_ixO]	movapd xmm1, [esp + nb332_iyO]	movapd xmm2, [esp + nb332_izO]	movapd xmm3, [esp + nb332_ixO]	movapd xmm4, [esp + nb332_iyO]	movapd xmm5, [esp + nb332_izO]	subpd  xmm0, [esp + nb332_jxO]	subpd  xmm1, [esp + nb332_jyO]	subpd  xmm2, [esp + nb332_jzO]	subpd  xmm3, [esp + nb332_jxH1]	subpd  xmm4, [esp + nb332_jyH1]	subpd  xmm5, [esp + nb332_jzH1]	movapd [esp + nb332_dxOO], xmm0	movapd [esp + nb332_dyOO], xmm1	movapd [esp + nb332_dzOO], xmm2	mulpd  xmm0, xmm0	mulpd  xmm1, xmm1	mulpd  xmm2, xmm2	movapd [esp + nb332_dxOH1], xmm3	movapd [esp + nb332_dyOH1], xmm4	movapd [esp + nb332_dzOH1], xmm5	mulpd  xmm3, xmm3	mulpd  xmm4, xmm4	mulpd  xmm5, xmm5	addpd  xmm0, xmm1	addpd  xmm0, xmm2	addpd  xmm3, xmm4	addpd  xmm3, xmm5	movapd [esp + nb332_rsqOO], xmm0	movapd [esp + nb332_rsqOH1], xmm3	movapd xmm0, [esp + nb332_ixO]	movapd xmm1, [esp + nb332_iyO]	movapd xmm2, [esp + nb332_izO]	movapd xmm3, [esp + nb332_ixH1]	movapd xmm4, [esp + nb332_iyH1]	movapd xmm5, [esp + nb332_izH1]	subpd  xmm0, [esp + nb332_jxH2]	subpd  xmm1, [esp + nb332_jyH2]	subpd  xmm2, [esp + nb332_jzH2]	subpd  xmm3, [esp + nb332_jxO]	subpd  xmm4, [esp + nb332_jyO]	subpd  xmm5, [esp + nb332_jzO]	movapd [esp + nb332_dxOH2], xmm0	movapd [esp + nb332_dyOH2], xmm1	movapd [esp + nb332_dzOH2], xmm2	mulpd  xmm0, xmm0	mulpd  xmm1, xmm1	mulpd  xmm2, xmm2	movapd [esp + nb332_dxH1O], xmm3	movapd [esp + nb332_dyH1O], xmm4	movapd [esp + nb332_dzH1O], xmm5	mulpd  xmm3, xmm3	mulpd  xmm4, xmm4	mulpd  xmm5, xmm5	addpd  xmm0, xmm1	addpd  xmm0, xmm2	addpd  xmm3, xmm4	addpd  xmm3, xmm5	movapd [esp + nb332_rsqOH2], xmm0	movapd [esp + nb332_rsqH1O], xmm3	movapd xmm0, [esp + nb332_ixH1]	movapd xmm1, [esp + nb332_iyH1]	movapd xmm2, [esp + nb332_izH1]	movapd xmm3, [esp + nb332_ixH1]	movapd xmm4, [esp + nb332_iyH1]	movapd xmm5, [esp + nb332_izH1]	subpd  xmm0, [esp + nb332_jxH1]	subpd  xmm1, [esp + nb332_jyH1]	subpd  xmm2, [esp + nb332_jzH1]	subpd  xmm3, [esp + nb332_jxH2]	subpd  xmm4, [esp + nb332_jyH2]	subpd 
 xmm5, [esp + nb332_jzH2]	movapd [esp + nb332_dxH1H1], xmm0	movapd [esp + nb332_dyH1H1], xmm1	movapd [esp + nb332_dzH1H1], xmm2	mulpd  xmm0, xmm0	mulpd  xmm1, xmm1	mulpd  xmm2, xmm2	movapd [esp + nb332_dxH1H2], xmm3	movapd [esp + nb332_dyH1H2], xmm4	movapd [esp + nb332_dzH1H2], xmm5	mulpd  xmm3, xmm3	mulpd  xmm4, xmm4	mulpd  xmm5, xmm5	addpd  xmm0, xmm1	addpd  xmm0, xmm2	addpd  xmm3, xmm4	addpd  xmm3, xmm5	movapd [esp + nb332_rsqH1H1], xmm0	movapd [esp + nb332_rsqH1H2], xmm3	movapd xmm0, [esp + nb332_ixH2]	movapd xmm1, [esp + nb332_iyH2]	movapd xmm2, [esp + nb332_izH2]	movapd xmm3, [esp + nb332_ixH2]	movapd xmm4, [esp + nb332_iyH2]	movapd xmm5, [esp + nb332_izH2]	subpd  xmm0, [esp + nb332_jxO]	subpd  xmm1, [esp + nb332_jyO]	subpd  xmm2, [esp + nb332_jzO]	subpd  xmm3, [esp + nb332_jxH1]	subpd  xmm4, [esp + nb332_jyH1]	subpd  xmm5, [esp + nb332_jzH1]	movapd [esp + nb332_dxH2O], xmm0	movapd [esp + nb332_dyH2O], xmm1	movapd [esp + nb332_dzH2O], xmm2	mulpd  xmm0, xmm0	mulpd  xmm1, xmm1	mulpd  xmm2, xmm2	movapd [esp + nb332_dxH2H1], xmm3	movapd [esp + nb332_dyH2H1], xmm4	movapd [esp + nb332_dzH2H1], xmm5	mulpd  xmm3, xmm3	mulpd  xmm4, xmm4	mulpd  xmm5, xmm5	addpd  xmm0, xmm1	addpd  xmm0, xmm2	addpd  xmm4, xmm3	addpd  xmm4, xmm5	movapd [esp + nb332_rsqH2O], xmm0	movapd [esp + nb332_rsqH2H1], xmm4	movapd xmm0, [esp + nb332_ixH2]	movapd xmm1, [esp + nb332_iyH2]	movapd xmm2, [esp + nb332_izH2]	subpd  xmm0, [esp + nb332_jxH2]	subpd  xmm1, [esp + nb332_jyH2]	subpd  xmm2, [esp + nb332_jzH2]	movapd [esp + nb332_dxH2H2], xmm0	movapd [esp + nb332_dyH2H2], xmm1	movapd [esp + nb332_dzH2H2], xmm2	mulpd xmm0, xmm0	mulpd xmm1, xmm1	mulpd xmm2, xmm2	addpd xmm0, xmm1	addpd xmm0, xmm2	movapd [esp + nb332_rsqH2H2], xmm0			;# start doing invsqrt use rsq values in xmm0, xmm4 	cvtpd2ps xmm1, xmm0		cvtpd2ps xmm5, xmm4		rsqrtps xmm1, xmm1	rsqrtps xmm5, xmm5	cvtps2pd xmm1, xmm1	cvtps2pd xmm5, xmm5		movapd  xmm2, xmm1	;# copy of luA 	movapd  xmm6, xmm5	;# copy of luB 	mulpd   xmm1, xmm1	;# 
luA*luA 	mulpd   xmm5, xmm5	;# luB*luB 	movapd  xmm3, [esp + nb332_three]	mulpd   xmm1, xmm0	;# rsqA*luA*luA 	mulpd   xmm5, xmm4	;# rsqB*luB*luB 		movapd  xmm7, xmm3	subpd   xmm3, xmm1	;# 3-rsqA*luA*luA 	subpd   xmm7, xmm5	;# 3-rsqB*luB*luB 	mulpd   xmm3, xmm2	;# luA*(3-rsqA*luA*luA) 	mulpd   xmm7, xmm6	;# luB*(3-rsqB*luB*luB) 	mulpd   xmm3, [esp + nb332_half] ;# iter1 	mulpd   xmm7, [esp + nb332_half] ;# iter1 	movapd  xmm2, xmm3	;# copy of luA 	movapd  xmm6, xmm7	;# copy of luB 	mulpd   xmm3, xmm3	;# luA*luA 	mulpd   xmm7, xmm7	;# luB*luB 	movapd  xmm1, [esp + nb332_three]	mulpd   xmm3, xmm0	;# rsqA*luA*luA 	mulpd   xmm7, xmm4	;# rsqB*luB*luB 		movapd  xmm5, xmm1	subpd   xmm1, xmm3	;# 3-rsqA*luA*luA 	subpd   xmm5, xmm7	;# 3-rsqB*luB*luB 	mulpd   xmm1, xmm2	;# luA*(3-rsqA*luA*luA) 	mulpd   xmm5, xmm6	;# luB*(3-rsqB*luB*luB) 	mulpd   xmm1, [esp + nb332_half] ;# rinv 	mulpd   xmm5, [esp + nb332_half] ;# rinv 	movapd [esp + nb332_rinvH2H2], xmm1	movapd [esp + nb332_rinvH2H1], xmm5	movapd xmm0, [esp + nb332_rsqOO]	movapd xmm4, [esp + nb332_rsqOH1]		cvtpd2ps xmm1, xmm0		cvtpd2ps xmm5, xmm4		rsqrtps xmm1, xmm1	rsqrtps xmm5, xmm5	cvtps2pd xmm1, xmm1	cvtps2pd xmm5, xmm5		movapd  xmm2, xmm1	;# copy of luA 	movapd  xmm6, xmm5	;# copy of luB 	mulpd   xmm1, xmm1	;# luA*luA 	mulpd   xmm5, xmm5	;# luB*luB 	movapd  xmm3, [esp + nb332_three]	mulpd   xmm1, xmm0	;# rsqA*luA*luA 	mulpd   xmm5, xmm4	;# rsqB*luB*luB 		movapd  xmm7, xmm3	subpd   xmm3, xmm1	;# 3-rsqA*luA*luA 	subpd   xmm7, xmm5	;# 3-rsqB*luB*luB 	mulpd   xmm3, xmm2	;# luA*(3-rsqA*luA*luA) 	mulpd   xmm7, xmm6	;# luB*(3-rsqB*luB*luB) 	mulpd   xmm3, [esp + nb332_half] ;# iter1 of  	mulpd   xmm7, [esp + nb332_half] ;# iter1 of  	movapd  xmm2, xmm3	;# copy of luA 	movapd  xmm6, xmm7	;# copy of luB 	mulpd   xmm3, xmm3	;# luA*luA 	mulpd   xmm7, xmm7	;# luB*luB 	movapd  xmm1, [esp + nb332_three]	mulpd   xmm3, xmm0	;# rsqA*luA*luA 	mulpd   xmm7, xmm4	;# rsqB*luB*luB 		movapd  xmm5, xmm1	subpd   xmm1, xmm3	;# 3-rsqA*luA*luA 	
subpd   xmm5, xmm7	;# 3-rsqB*luB*luB 	mulpd   xmm1, xmm2	;# luA*(3-rsqA*luA*luA) 	mulpd   xmm5, xmm6	;# luB*(3-rsqB*luB*luB) 	mulpd   xmm1, [esp + nb332_half] ;# rinv 	mulpd   xmm5, [esp + nb332_half] ;# rinv	movapd [esp + nb332_rinvOO], xmm1	movapd [esp + nb332_rinvOH1], xmm5	movapd xmm0, [esp + nb332_rsqOH2]	movapd xmm4, [esp + nb332_rsqH1O]		cvtpd2ps xmm1, xmm0		cvtpd2ps xmm5, xmm4		rsqrtps xmm1, xmm1	rsqrtps xmm5, xmm5	cvtps2pd xmm1, xmm1	cvtps2pd xmm5, xmm5		movapd  xmm2, xmm1	;# copy of luA 	movapd  xmm6, xmm5	;# copy of luB 	mulpd   xmm1, xmm1	;# luA*luA 	mulpd   xmm5, xmm5	;# luB*luB 	movapd  xmm3, [esp + nb332_three]	mulpd   xmm1, xmm0	;# rsqA*luA*luA 	mulpd   xmm5, xmm4	;# rsqB*luB*luB 		movapd  xmm7, xmm3	subpd   xmm3, xmm1	;# 3-rsqA*luA*luA 	subpd   xmm7, xmm5	;# 3-rsqB*luB*luB 	mulpd   xmm3, xmm2	;# luA*(3-rsqA*luA*luA) 	mulpd   xmm7, xmm6	;# luB*(3-rsqB*luB*luB) 	mulpd   xmm3, [esp + nb332_half] ;# iter1 	mulpd   xmm7, [esp + nb332_half] ;# iter1 	movapd  xmm2, xmm3	;# copy of luA 	movapd  xmm6, xmm7	;# copy of luB 	mulpd   xmm3, xmm3	;# luA*luA 	mulpd   xmm7, xmm7	;# luB*luB 	movapd  xmm1, [esp + nb332_three]	mulpd   xmm3, xmm0	;# rsqA*luA*luA 	mulpd   xmm7, xmm4	;# rsqB*luB*luB 		movapd  xmm5, xmm1	subpd   xmm1, xmm3	;# 3-rsqA*luA*luA 	subpd   xmm5, xmm7	;# 3-rsqB*luB*luB 	mulpd   xmm1, xmm2	;# luA*(3-rsqA*luA*luA) 	mulpd   xmm5, xmm6	;# luB*(3-rsqB*luB*luB) 	mulpd   xmm1, [esp + nb332_half] ;# rinv 	mulpd   xmm5, [esp + nb332_half] ;# rinv 	movapd [esp + nb332_rinvOH2], xmm1	movapd [esp + nb332_rinvH1O], xmm5	movapd xmm0, [esp + nb332_rsqH1H1]	movapd xmm4, [esp + nb332_rsqH1H2]		cvtpd2ps xmm1, xmm0		cvtpd2ps xmm5, xmm4		rsqrtps xmm1, xmm1	rsqrtps xmm5, xmm5	cvtps2pd xmm1, xmm1	cvtps2pd xmm5, xmm5		movapd  xmm2, xmm1	;# copy of luA 	movapd  xmm6, xmm5	;# copy of luB 	mulpd   xmm1, xmm1	;# luA*luA 	mulpd   xmm5, xmm5	;# luB*luB 	movapd  xmm3, [esp + nb332_three]	mulpd   xmm1, xmm0	;# rsqA*luA*luA 	mulpd   xmm5, xmm4	;# rsqB*luB*luB 		movapd  xmm7, 
xmm3	subpd   xmm3, xmm1	;# 3-rsqA*luA*luA 	subpd   xmm7, xmm5	;# 3-rsqB*luB*luB 	mulpd   xmm3, xmm2	;# luA*(3-rsqA*luA*luA) 	mulpd   xmm7, xmm6	;# luB*(3-rsqB*luB*luB) 	mulpd   xmm3, [esp + nb332_half] ;# iter1a 	mulpd   xmm7, [esp + nb332_half] ;# iter1b 	movapd  xmm2, xmm3	;# copy of luA 	movapd  xmm6, xmm7	;# copy of luB 	mulpd   xmm3, xmm3	;# luA*luA 	mulpd   xmm7, xmm7	;# luB*luB 	movapd  xmm1, [esp + nb332_three]	mulpd   xmm3, xmm0	;# rsqA*luA*luA 	mulpd   xmm7, xmm4	;# rsqB*luB*luB 		movapd  xmm5, xmm1	subpd   xmm1, xmm3	;# 3-rsqA*luA*luA 	subpd   xmm5, xmm7	;# 3-rsqB*luB*luB 	mulpd   xmm1, xmm2	;# luA*(3-rsqA*luA*luA) 	mulpd   xmm5, xmm6	;# luB*(3-rsqB*luB*luB) 	mulpd   xmm1, [esp + nb332_half] ;# rinv 	mulpd   xmm5, [esp + nb332_half] ;# rinv 	movapd [esp + nb332_rinvH1H1], xmm1	movapd [esp + nb332_rinvH1H2], xmm5	movapd xmm0, [esp + nb332_rsqH2O]	cvtpd2ps xmm1, xmm0		rsqrtps xmm1, xmm1	cvtps2pd xmm1, xmm1		movapd  xmm2, xmm1	;# copy of luA 	mulpd   xmm1, xmm1	;# luA*luA 	movapd  xmm3, [esp + nb332_three]	mulpd   xmm1, xmm0	;# rsqA*luA*luA 	subpd   xmm3, xmm1	;# 3-rsqA*luA*luA 	mulpd   xmm3, xmm2	;# luA*(3-rsqA*luA*luA) 	mulpd   xmm3, [esp + nb332_half] ;# iter1 	movapd  xmm2, xmm3	;# copy of luA 	mulpd   xmm3, xmm3	;# luA*luA 	movapd  xmm1, [esp + nb332_three]	mulpd   xmm3, xmm0	;# rsqA*luA*luA 	subpd   xmm1, xmm3	;# 3-rsqA*luA*luA 	mulpd   xmm1, xmm2	;# luA*(3-rsqA*luA*luA) 	mulpd   xmm1, [esp + nb332_half] ;# rinv 	movapd [esp + nb332_rinvH2O], xmm1		;# start with OO interaction 	movapd xmm0, [esp + nb332_rinvOO]	movapd xmm1, xmm0	mulpd  xmm1, [esp + nb332_rsqOO] ;# xmm1=r 	mulpd  xmm1, [esp + nb332_tsc]	cvttpd2pi mm6, xmm1	;# mm6 = lu idx 	cvtpi2pd xmm6, mm6	subpd xmm1, xmm6	;# xmm1=eps 	movapd xmm2, xmm1		mulpd  xmm2, xmm2	;# xmm2=eps2 		pslld mm6, 2		;# idx *= 4 	movd mm0, eax		movd mm1, ebx	mov  esi, [ebp + nb332_VFtab]	movd eax, mm6	psrlq mm6, 32	movd ebx, mm6		;# indices in eax/ebx 	lea   eax, [eax + eax*2]	;# idx*=3 (12 total now) 	lea   ebx, 
[ebx + ebx*2]	;# idx*=3 (12 total now) 	movlpd xmm4, [esi + eax*8]	;# Y1		movlpd xmm3, [esi + ebx*8]	;# Y2	movhpd xmm4, [esi + eax*8 + 8]	;# Y1 F1 		movhpd xmm3, [esi + ebx*8 + 8]	;# Y2 F2 	movapd xmm5, xmm4	unpcklpd xmm4, xmm3	;# Y1 Y2 	unpckhpd xmm5, xmm3	;# F1 F2 	movlpd xmm6, [esi + eax*8 + 16]	;# G1	movlpd xmm3, [esi + ebx*8 + 16]	;# G2	movhpd xmm6, [esi + eax*8 + 24]	;# G1 H1 		movhpd xmm3, [esi + ebx*8 + 24]	;# G2 H2 	movapd xmm7, xmm6	unpcklpd xmm6, xmm3	;# G1 G2 	unpckhpd xmm7, xmm3	;# H1 H2 	;# coulomb table ready, in xmm4-xmm7  			mulpd  xmm6, xmm1	;# xmm6=Geps 	mulpd  xmm7, xmm2	;# xmm7=Heps2 	addpd  xmm5, xmm6	addpd  xmm5, xmm7	;# xmm5=Fp 		mulpd  xmm7, [esp + nb332_two]	;# two*Heps2 	movapd xmm3, [esp + nb332_qqOO]	addpd  xmm7, xmm6	addpd  xmm7, xmm5 ;# xmm7=FF 	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 	addpd  xmm5, xmm4 ;# xmm5=VV 	mulpd  xmm5, xmm3 ;# vcoul=qq*VV  	mulpd  xmm3, xmm7 ;# fijC=FF*qq     ;# at this point xmm5 contains vcoul and xmm3 fijC     ;# increment vcoul - then we can get rid of xmm5     ;# update vctot     addpd  xmm5, [esp + nb332_vctot]    movapd [esp + nb332_vctot], xmm5	;# put scalar force on stack temporarily 	movapd [esp + nb332_fscal], xmm3	;# Dispersion 	movlpd xmm4, [esi + eax*8 + 32]	;# Y1	movlpd xmm3, [esi + ebx*8 + 32]	;# Y2	movhpd xmm4, [esi + eax*8 + 40]	;# Y1 F1 		movhpd xmm3, [esi + ebx*8 + 40]	;# Y2 F2 	movapd xmm5, xmm4	unpcklpd xmm4, xmm3	;# Y1 Y2 	unpckhpd xmm5, xmm3	;# F1 F2 	movlpd xmm6, [esi + eax*8 + 48]	;# G1	movlpd xmm3, [esi + ebx*8 + 48]	;# G2	movhpd xmm6, [esi + eax*8 + 56]	;# G1 H1 		movhpd xmm3, [esi + ebx*8 + 56]	;# G2 H2 	movapd xmm7, xmm6	unpcklpd xmm6, xmm3	;# G1 G2 	unpckhpd xmm7, xmm3	;# H1 H2 	;# Dispersion table ready, in xmm4-xmm7  			mulpd  xmm6, xmm1	;# xmm6=Geps 	mulpd  xmm7, xmm2	;# xmm7=Heps2 	addpd  xmm5, xmm6	addpd  xmm5, xmm7	;# xmm5=Fp 		mulpd  xmm7, [esp + nb332_two]	;# two*Heps2 	addpd  xmm7, xmm6	addpd  xmm7, xmm5 ;# xmm7=FF 	mulpd  xmm5, xmm1 ;# xmm5=eps*Fp 	addpd  xmm5, xmm4 ;# xmm5=VV 	
movapd xmm4, [esp + nb332_c6]	mulpd  xmm7, xmm4	 ;# fijD 	mulpd  xmm5, xmm4	 ;# Vvdw6 	addpd  xmm7, [esp + nb332_fscal] ;# add to fscal 	;# put scalar force back on stack Update Vvdwtot directly 	addpd  xmm5, [esp + nb332_Vvdwtot]	movapd [esp + nb332_fscal], xmm7	movapd [esp + nb332_Vvdwtot], xmm5	;# Repulsion 	movlpd xmm4, [esi + eax*8 + 64]	;# Y1	movlpd xmm3, [esi + ebx*8 + 64]	;# Y2	movhpd xmm4, [esi + eax*8 + 72]	;# Y1 F1 		movhpd xmm3, [esi + ebx*8 + 72]	;# Y2 F2 	movapd xmm5, xmm4	unpcklpd xmm4, xmm3	;# Y1 Y2 	unpckhpd xmm5, xmm3	;# F1 F2 	movlpd xmm6, [esi + eax*8 + 80]	;# G1	movlpd xmm3, [esi + ebx*8 + 80]	;# G2	movhpd xmm6, [esi + eax*8 + 88]	;# G1 H1 		movhpd xmm3, [esi + ebx*8 + 88]	;# G2 H2 

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?